In [10]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
#%matplotlib inline
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
warnings.filterwarnings('ignore')
In [4]:
df = pd.read_excel('compactiv.xlsx')
In [5]:
df.head()
Out[5]:
lread lwrite scall sread swrite fork exec rchar wchar pgout ... pgscan atch pgin ppgin pflt vflt runqsz freemem freeswap usr
0 1 0 2147 79 68 0.2 0.2 40671.0 53995.0 0.0 ... 0.0 0.0 1.6 2.6 16.00 26.40 CPU_Bound 4670 1730946 95
1 0 0 170 18 21 0.2 0.2 448.0 8385.0 0.0 ... 0.0 0.0 0.0 0.0 15.63 16.83 Not_CPU_Bound 7278 1869002 97
2 15 3 2162 159 119 2.0 2.4 NaN 31950.0 0.0 ... 0.0 1.2 6.0 9.4 150.20 220.20 Not_CPU_Bound 702 1021237 87
3 0 0 160 12 16 0.2 0.2 NaN 8670.0 0.0 ... 0.0 0.0 0.2 0.2 15.60 16.80 Not_CPU_Bound 7248 1863704 98
4 5 1 330 39 38 0.4 0.4 NaN 12185.0 0.0 ... 0.0 0.0 1.0 1.2 37.80 47.60 Not_CPU_Bound 633 1760253 90

5 rows × 22 columns

In [6]:
df.tail()
Out[6]:
lread lwrite scall sread swrite fork exec rchar wchar pgout ... pgscan atch pgin ppgin pflt vflt runqsz freemem freeswap usr
8187 16 12 3009 360 244 1.6 5.81 405250.0 85282.0 8.02 ... 55.11 0.6 35.87 47.90 139.28 270.74 CPU_Bound 387 986647 80
8188 4 0 1596 170 146 2.4 1.80 89489.0 41764.0 3.80 ... 0.20 0.8 3.80 4.40 122.40 212.60 Not_CPU_Bound 263 1055742 90
8189 16 5 3116 289 190 0.6 0.60 325948.0 52640.0 0.40 ... 0.00 0.4 28.40 45.20 60.20 219.80 Not_CPU_Bound 400 969106 87
8190 32 45 5180 254 179 1.2 1.20 62571.0 29505.0 1.40 ... 18.04 0.4 23.05 24.25 93.19 202.81 CPU_Bound 141 1022458 83
8191 2 0 985 55 46 1.6 4.80 111111.0 22256.0 0.00 ... 0.00 0.2 3.40 6.20 91.80 110.00 CPU_Bound 659 1756514 94

5 rows × 22 columns

In [7]:
df.shape
Out[7]:
(8192, 22)
In [8]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8192 entries, 0 to 8191
Data columns (total 22 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   lread     8192 non-null   int64  
 1   lwrite    8192 non-null   int64  
 2   scall     8192 non-null   int64  
 3   sread     8192 non-null   int64  
 4   swrite    8192 non-null   int64  
 5   fork      8192 non-null   float64
 6   exec      8192 non-null   float64
 7   rchar     8088 non-null   float64
 8   wchar     8177 non-null   float64
 9   pgout     8192 non-null   float64
 10  ppgout    8192 non-null   float64
 11  pgfree    8192 non-null   float64
 12  pgscan    8192 non-null   float64
 13  atch      8192 non-null   float64
 14  pgin      8192 non-null   float64
 15  ppgin     8192 non-null   float64
 16  pflt      8192 non-null   float64
 17  vflt      8192 non-null   float64
 18  runqsz    8192 non-null   object 
 19  freemem   8192 non-null   int64  
 20  freeswap  8192 non-null   int64  
 21  usr       8192 non-null   int64  
dtypes: float64(13), int64(8), object(1)
memory usage: 1.4+ MB
In [9]:
df.describe().T
Out[9]:
count mean std min 25% 50% 75% max
lread 8192.0 1.955969e+01 53.353799 0.0 2.0 7.0 20.000 1845.00
lwrite 8192.0 1.310620e+01 29.891726 0.0 0.0 1.0 10.000 575.00
scall 8192.0 2.306318e+03 1633.617322 109.0 1012.0 2051.5 3317.250 12493.00
sread 8192.0 2.104800e+02 198.980146 6.0 86.0 166.0 279.000 5318.00
swrite 8192.0 1.500582e+02 160.478980 7.0 63.0 117.0 185.000 5456.00
fork 8192.0 1.884554e+00 2.479493 0.0 0.4 0.8 2.200 20.12
exec 8192.0 2.791998e+00 5.212456 0.0 0.2 1.2 2.800 59.56
rchar 8088.0 1.973857e+05 239837.493526 278.0 34091.5 125473.5 267828.750 2526649.00
wchar 8177.0 9.590299e+04 140841.707911 1498.0 22916.0 46619.0 106101.000 1801623.00
pgout 8192.0 2.285317e+00 5.307038 0.0 0.0 0.0 2.400 81.44
ppgout 8192.0 5.977229e+00 15.214590 0.0 0.0 0.0 4.200 184.20
pgfree 8192.0 1.191971e+01 32.363520 0.0 0.0 0.0 5.000 523.00
pgscan 8192.0 2.152685e+01 71.141340 0.0 0.0 0.0 0.000 1237.00
atch 8192.0 1.127505e+00 5.708347 0.0 0.0 0.0 0.600 211.58
pgin 8192.0 8.277960e+00 13.874978 0.0 0.6 2.8 9.765 141.20
ppgin 8192.0 1.238859e+01 22.281318 0.0 0.6 3.8 13.800 292.61
pflt 8192.0 1.097938e+02 114.419221 0.0 25.0 63.8 159.600 899.80
vflt 8192.0 1.853158e+02 191.000603 0.2 45.4 120.4 251.800 1365.00
freemem 8192.0 1.763456e+03 2482.104511 55.0 231.0 579.0 2002.250 12027.00
freeswap 8192.0 1.328126e+06 422019.426957 2.0 1042623.5 1289289.5 1730379.500 2243187.00
usr 8192.0 8.396887e+01 18.401905 0.0 81.0 89.0 94.000 99.00
In [10]:
df.duplicated().sum()
Out[10]:
0
In [11]:
df.isnull().sum()
Out[11]:
lread         0
lwrite        0
scall         0
sread         0
swrite        0
fork          0
exec          0
rchar       104
wchar        15
pgout         0
ppgout        0
pgfree        0
pgscan        0
atch          0
pgin          0
ppgin         0
pflt          0
vflt          0
runqsz        0
freemem       0
freeswap      0
usr           0
dtype: int64
In [12]:
for column in df.columns:
    counts = df[column].value_counts()
    print(f"Value counts for {column}:\n{counts}\n")
Value counts for lread:
lread
1      1050
2       732
0       675
3       539
4       408
       ... 
223       1
254       1
141       1
117       1
129       1
Name: count, Length: 235, dtype: int64

Value counts for lwrite:
lwrite
0      2684
1      1529
2       615
3       284
4       253
       ... 
183       1
138       1
270       1
120       1
267       1
Name: count, Length: 189, dtype: int64

Value counts for scall:
scall
158     10
220     10
419      9
160      9
230      9
        ..
3362     1
4460     1
4765     1
3868     1
5180     1
Name: count, Length: 4115, dtype: int64

Value counts for sread:
sread
16     43
10     41
43     40
12     38
95     37
       ..
671     1
867     1
420     1
772     1
674     1
Name: count, Length: 794, dtype: int64

Value counts for swrite:
swrite
30      56
91      56
24      53
118     51
22      50
        ..
599      1
732      1
419      1
1042     1
612      1
Name: count, Length: 640, dtype: int64

Value counts for fork:
fork
0.20     1999
0.40      966
0.60      716
0.80      563
1.00      398
         ... 
12.38       1
1.78        1
3.56        1
0.59        1
6.37        1
Name: count, Length: 228, dtype: int64

Value counts for exec:
exec
0.20     2060
0.40      595
0.60      571
0.80      453
1.00      344
         ... 
34.33       1
28.34       1
9.15        1
34.67       1
34.47       1
Name: count, Length: 386, dtype: int64

Value counts for rchar:
rchar
452.0       6
6994.0      5
7001.0      5
7018.0      4
425.0       4
           ..
122096.0    1
23110.0     1
49659.0     1
94575.0     1
111111.0    1
Name: count, Length: 7898, dtype: int64

Value counts for wchar:
wchar
18709.0     4
13554.0     3
21962.0     3
25473.0     3
8482.0      3
           ..
82665.0     1
158009.0    1
38142.0     1
25607.0     1
22256.0     1
Name: count, Length: 7925, dtype: int64

Value counts for pgout:
pgout
0.00     4878
0.20      140
0.40      140
0.60      135
0.80      126
         ... 
38.00       1
6.61        1
43.60       1
23.20       1
14.74       1
Name: count, Length: 404, dtype: int64

Value counts for ppgout:
ppgout
0.00     4878
0.40      116
0.60      106
0.20       99
0.80       95
         ... 
45.20       1
40.20       1
45.51       1
65.67       1
55.71       1
Name: count, Length: 774, dtype: int64

Value counts for pgfree:
pgfree
0.00      4869
0.40       115
0.20        98
0.60        98
0.80        87
          ... 
73.65        1
131.00       1
96.81        1
62.12        1
13.03        1
Name: count, Length: 1070, dtype: int64

Value counts for pgscan:
pgscan
0.00      6448
0.60         9
1.20         7
2.40         7
27.00        7
          ... 
286.23       1
190.18       1
43.29        1
256.09       1
18.04        1
Name: count, Length: 1202, dtype: int64

Value counts for atch:
atch
0.00     4575
0.20      804
0.40      504
0.60      307
0.80      287
         ... 
98.61       1
41.80       1
18.84       1
2.58        1
8.82        1
Name: count, Length: 253, dtype: int64

Value counts for pgin:
pgin
0.00     1220
0.20      457
0.40      317
0.60      288
0.80      248
         ... 
53.31       1
69.58       1
72.20       1
19.28       1
35.87       1
Name: count, Length: 832, dtype: int64

Value counts for ppgin:
ppgin
0.00     1220
0.20      350
0.40      332
0.80      221
0.60      215
         ... 
79.04       1
71.40       1
16.23       1
84.03       1
47.90       1
Name: count, Length: 1072, dtype: int64

Value counts for pflt:
pflt
15.60     532
15.80     114
15.40     111
16.00      83
15.57      79
         ... 
300.00      1
144.71      1
254.60      1
76.65       1
93.19       1
Name: count, Length: 2987, dtype: int64

Value counts for vflt:
vflt
16.80     412
17.00      95
16.83      67
16.77      53
17.20      50
         ... 
482.57      1
525.00      1
497.01      1
27.05       1
270.74      1
Name: count, Length: 3799, dtype: int64

Value counts for runqsz:
runqsz
Not_CPU_Bound    4331
CPU_Bound        3861
Name: count, dtype: int64

Value counts for freemem:
freemem
132      37
159      31
168      29
136      28
139      28
         ..
874       1
11640     1
4728      1
6888      1
6210      1
Name: count, Length: 3165, dtype: int64

Value counts for freeswap:
freeswap
11         25
10         23
9          22
12         19
7          19
           ..
1745791     1
1064513     1
1092566     1
1381216     1
1756514     1
Name: count, Length: 7658, dtype: int64

Value counts for usr:
usr
90    459
91    448
92    426
94    421
93    411
97    410
96    410
95    405
88    384
98    378
89    376
87    338
0     283
86    283
85    254
84    252
83    230
81    201
82    187
80    166
79    150
77    144
78    126
76    119
75    104
74     96
72     77
73     73
99     60
69     51
71     49
68     46
70     42
67     39
66     36
63     32
64     27
62     27
65     25
59     23
60     20
58     17
61     16
57     14
56     11
1      10
55     10
54      7
53      5
50      4
51      4
52      2
49      1
48      1
2       1
46      1
Name: count, dtype: int64

In [13]:
# Select numerical columns
num_columns = df.select_dtypes(include=['int64', 'float64']).columns

# Select categorical columns
cat_columns = df.select_dtypes(include=['object', 'category']).columns
In [867]:
num_columns
Out[867]:
['lread',
 'lwrite',
 'scall',
 'sread',
 'swrite',
 'fork',
 'exec',
 'rchar',
 'wchar',
 'pgout',
 'ppgout',
 'pgfree',
 'pgscan',
 'atch',
 'pgin',
 'ppgin',
 'pflt',
 'vflt',
 'freemem',
 'freeswap',
 'usr']
In [868]:
cat_columns
Out[868]:
Index(['runqsz'], dtype='object')

Uni-variate Analysis¶

Categorical¶

In [14]:
fig, ax = plt.subplots(figsize=(15, 10))

# Loop through the selected columns and draw countplots on a single set of axes
for column in cat_columns:
    sns.countplot(x=column, data=df, ax=ax)
    ax.set_title(f'Countplot for {column}')

# Adjust layout to prevent overlapping titles
plt.tight_layout()

# Show the plot
plt.show()
No description has been provided for this image
In [15]:
# Loop through the selected columns
for column in cat_columns:
    count_info = df[column].value_counts()
    print(f'Count information for {column}:\n{count_info}\n{"="*30}\n')
Count information for runqsz:
runqsz
Not_CPU_Bound    4331
CPU_Bound        3861
Name: count, dtype: int64
==============================

Numerical¶

In [16]:
# Set up subplots dynamically
fig, axes = plt.subplots(nrows=21, ncols=1, figsize=(15, 100))

# Draw histplot for each numerical column
for i, column in enumerate(num_columns):
    sns.histplot(df[column], bins=20, kde=True, ax=axes[i])
    axes[i].set_title(f'Histogram for {column}')

# Adjust layout to prevent overlapping titles
plt.tight_layout()

# Show the plot
plt.show()
No description has been provided for this image
In [870]:
# Calculate and display histogram values for each numerical column
for column in num_columns:
    hist_values, bin_edges = np.histogram(df[column], bins=20)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    
    print(f'Histogram values for {column}:')
    for center, value in zip(bin_centers, hist_values):
        print(f'Bin Center: {center}, Count: {value}')
    print('\n')
Histogram values for lread:
Bin Center: 1.175, Count: 2457
Bin Center: 3.5250000000000004, Count: 947
Bin Center: 5.875, Count: 924
Bin Center: 8.225000000000001, Count: 409
Bin Center: 10.575, Count: 359
Bin Center: 12.925, Count: 471
Bin Center: 15.275, Count: 238
Bin Center: 17.625, Count: 218
Bin Center: 19.975, Count: 277
Bin Center: 22.325000000000003, Count: 146
Bin Center: 24.675, Count: 113
Bin Center: 27.025000000000002, Count: 164
Bin Center: 29.375, Count: 100
Bin Center: 31.725, Count: 86
Bin Center: 34.075, Count: 133
Bin Center: 36.425, Count: 83
Bin Center: 38.775000000000006, Count: 67
Bin Center: 41.125, Count: 97
Bin Center: 43.475, Count: 64
Bin Center: 45.825, Count: 839


Histogram values for lwrite:
Bin Center: 0.625, Count: 4213
Bin Center: 1.875, Count: 615
Bin Center: 3.125, Count: 284
Bin Center: 4.375, Count: 253
Bin Center: 5.625, Count: 372
Bin Center: 6.875, Count: 126
Bin Center: 8.125, Count: 113
Bin Center: 9.375, Count: 100
Bin Center: 10.625, Count: 182
Bin Center: 11.875, Count: 75
Bin Center: 13.125, Count: 100
Bin Center: 14.375, Count: 62
Bin Center: 15.625, Count: 94
Bin Center: 16.875, Count: 43
Bin Center: 18.125, Count: 43
Bin Center: 19.375, Count: 33
Bin Center: 20.625, Count: 63
Bin Center: 21.875, Count: 28
Bin Center: 23.125, Count: 29
Bin Center: 24.375, Count: 1364


Histogram values for scall:
Bin Center: 275.653125, Count: 1026
Bin Center: 608.9593749999999, Count: 596
Bin Center: 942.2656249999999, Count: 608
Bin Center: 1275.5718749999999, Count: 681
Bin Center: 1608.878125, Count: 643
Bin Center: 1942.1843749999998, Count: 657
Bin Center: 2275.4906249999995, Count: 672
Bin Center: 2608.796875, Count: 535
Bin Center: 2942.1031249999996, Count: 457
Bin Center: 3275.409375, Count: 425
Bin Center: 3608.715625, Count: 383
Bin Center: 3942.0218749999995, Count: 333
Bin Center: 4275.328125, Count: 293
Bin Center: 4608.634375, Count: 223
Bin Center: 4941.940624999999, Count: 153
Bin Center: 5275.246875, Count: 135
Bin Center: 5608.553124999999, Count: 95
Bin Center: 5941.859375, Count: 74
Bin Center: 6275.165625, Count: 59
Bin Center: 6608.471874999999, Count: 144


Histogram values for sread:
Bin Center: 20.0625, Count: 705
Bin Center: 48.1875, Count: 725
Bin Center: 76.3125, Count: 724
Bin Center: 104.4375, Count: 749
Bin Center: 132.5625, Count: 713
Bin Center: 160.6875, Count: 704
Bin Center: 188.8125, Count: 613
Bin Center: 216.9375, Count: 523
Bin Center: 245.0625, Count: 431
Bin Center: 273.1875, Count: 375
Bin Center: 301.3125, Count: 306
Bin Center: 329.4375, Count: 251
Bin Center: 357.5625, Count: 231
Bin Center: 385.6875, Count: 172
Bin Center: 413.8125, Count: 163
Bin Center: 441.9375, Count: 133
Bin Center: 470.0625, Count: 125
Bin Center: 498.1875, Count: 92
Bin Center: 526.3125, Count: 61
Bin Center: 554.4375, Count: 396


Histogram values for swrite:
Bin Center: 16.025, Count: 607
Bin Center: 34.075, Count: 693
Bin Center: 52.125, Count: 688
Bin Center: 70.17500000000001, Count: 708
Bin Center: 88.225, Count: 680
Bin Center: 106.275, Count: 656
Bin Center: 124.32500000000002, Count: 660
Bin Center: 142.375, Count: 599
Bin Center: 160.425, Count: 499
Bin Center: 178.47500000000002, Count: 400
Bin Center: 196.525, Count: 314
Bin Center: 214.57500000000002, Count: 245
Bin Center: 232.625, Count: 203
Bin Center: 250.675, Count: 184
Bin Center: 268.725, Count: 144
Bin Center: 286.775, Count: 91
Bin Center: 304.82500000000005, Count: 88
Bin Center: 322.875, Count: 91
Bin Center: 340.925, Count: 81
Bin Center: 358.975, Count: 561


Histogram values for fork:
Bin Center: 0.12250000000000001, Count: 2021
Bin Center: 0.36750000000000005, Count: 967
Bin Center: 0.6125, Count: 717
Bin Center: 0.8575000000000002, Count: 564
Bin Center: 1.1025, Count: 714
Bin Center: 1.3475000000000001, Count: 271
Bin Center: 1.5925000000000002, Count: 219
Bin Center: 1.8375000000000001, Count: 257
Bin Center: 2.0825, Count: 417
Bin Center: 2.3275, Count: 140
Bin Center: 2.5725000000000002, Count: 148
Bin Center: 2.8175000000000003, Count: 119
Bin Center: 3.0625000000000004, Count: 110
Bin Center: 3.3075, Count: 188
Bin Center: 3.5525, Count: 79
Bin Center: 3.7975000000000003, Count: 71
Bin Center: 4.0425, Count: 60
Bin Center: 4.2875, Count: 92
Bin Center: 4.532500000000001, Count: 54
Bin Center: 4.7775, Count: 984


Histogram values for exec:
Bin Center: 0.16749999999999998, Count: 2082
Bin Center: 0.5025, Count: 1169
Bin Center: 0.8374999999999999, Count: 802
Bin Center: 1.1724999999999999, Count: 291
Bin Center: 1.5074999999999998, Count: 472
Bin Center: 1.8424999999999998, Count: 642
Bin Center: 2.1774999999999998, Count: 282
Bin Center: 2.5124999999999997, Count: 368
Bin Center: 2.8474999999999997, Count: 288
Bin Center: 3.1824999999999997, Count: 142
Bin Center: 3.5174999999999996, Count: 216
Bin Center: 3.8524999999999996, Count: 182
Bin Center: 4.1875, Count: 68
Bin Center: 4.522499999999999, Count: 130
Bin Center: 4.8575, Count: 100
Bin Center: 5.192499999999999, Count: 43
Bin Center: 5.5275, Count: 69
Bin Center: 5.862499999999999, Count: 59
Bin Center: 6.1975, Count: 28
Bin Center: 6.532499999999999, Count: 759


Histogram values for rchar:
Bin Center: 15550.953125, Count: 1921
Bin Center: 46096.859375, Count: 807
Bin Center: 76642.765625, Count: 609
Bin Center: 107188.671875, Count: 659
Bin Center: 137734.578125, Count: 637
Bin Center: 168280.484375, Count: 477
Bin Center: 198826.390625, Count: 420
Bin Center: 229372.296875, Count: 370
Bin Center: 259918.203125, Count: 358
Bin Center: 290464.109375, Count: 285
Bin Center: 321010.015625, Count: 229
Bin Center: 351555.921875, Count: 208
Bin Center: 382101.828125, Count: 152
Bin Center: 412647.734375, Count: 131
Bin Center: 443193.640625, Count: 99
Bin Center: 473739.546875, Count: 95
Bin Center: 504285.453125, Count: 75
Bin Center: 534831.359375, Count: 82
Bin Center: 565377.265625, Count: 58
Bin Center: 595923.171875, Count: 520


Histogram values for wchar:
Bin Center: 7226.196875, Count: 951
Bin Center: 18682.590624999997, Count: 1264
Bin Center: 30138.984374999996, Count: 1163
Bin Center: 41595.378124999996, Count: 779
Bin Center: 53051.771875, Count: 638
Bin Center: 64508.165624999994, Count: 456
Bin Center: 75964.55937499998, Count: 334
Bin Center: 87420.953125, Count: 293
Bin Center: 98877.34687499999, Count: 233
Bin Center: 110333.740625, Count: 217
Bin Center: 121790.134375, Count: 165
Bin Center: 133246.52812499998, Count: 149
Bin Center: 144702.921875, Count: 131
Bin Center: 156159.315625, Count: 108
Bin Center: 167615.70937499998, Count: 84
Bin Center: 179072.103125, Count: 93
Bin Center: 190528.49687499998, Count: 89
Bin Center: 201984.890625, Count: 79
Bin Center: 213441.284375, Count: 74
Bin Center: 224897.67812499998, Count: 892


Histogram values for pgout:
Bin Center: 0.15, Count: 5018
Bin Center: 0.44999999999999996, Count: 141
Bin Center: 0.75, Count: 262
Bin Center: 1.0499999999999998, Count: 113
Bin Center: 1.35, Count: 202
Bin Center: 1.65, Count: 109
Bin Center: 1.95, Count: 176
Bin Center: 2.25, Count: 74
Bin Center: 2.55, Count: 168
Bin Center: 2.8499999999999996, Count: 90
Bin Center: 3.15, Count: 119
Bin Center: 3.4499999999999997, Count: 66
Bin Center: 3.75, Count: 109
Bin Center: 4.05, Count: 99
Bin Center: 4.35, Count: 97
Bin Center: 4.65, Count: 49
Bin Center: 4.949999999999999, Count: 86
Bin Center: 5.25, Count: 74
Bin Center: 5.55, Count: 73
Bin Center: 5.85, Count: 1067


Histogram values for ppgout:
Bin Center: 0.2625, Count: 5093
Bin Center: 0.7875000000000001, Count: 283
Bin Center: 1.3125, Count: 163
Bin Center: 1.8375000000000001, Count: 191
Bin Center: 2.3625, Count: 126
Bin Center: 2.8875, Count: 73
Bin Center: 3.4125000000000005, Count: 115
Bin Center: 3.9375, Count: 78
Bin Center: 4.4625, Count: 111
Bin Center: 4.987500000000001, Count: 97
Bin Center: 5.5125, Count: 61
Bin Center: 6.0375000000000005, Count: 80
Bin Center: 6.5625, Count: 63
Bin Center: 7.0875, Count: 41
Bin Center: 7.612500000000001, Count: 77
Bin Center: 8.1375, Count: 52
Bin Center: 8.662500000000001, Count: 48
Bin Center: 9.1875, Count: 44
Bin Center: 9.7125, Count: 35
Bin Center: 10.2375, Count: 1361


Histogram values for pgfree:
Bin Center: 0.3125, Count: 5181
Bin Center: 0.9375, Count: 237
Bin Center: 1.5625, Count: 191
Bin Center: 2.1875, Count: 134
Bin Center: 2.8125, Count: 109
Bin Center: 3.4375, Count: 93
Bin Center: 4.0625, Count: 98
Bin Center: 4.6875, Count: 89
Bin Center: 5.3125, Count: 82
Bin Center: 5.9375, Count: 61
Bin Center: 6.5625, Count: 42
Bin Center: 7.1875, Count: 37
Bin Center: 7.8125, Count: 46
Bin Center: 8.4375, Count: 39
Bin Center: 9.0625, Count: 37
Bin Center: 9.6875, Count: 35
Bin Center: 10.3125, Count: 50
Bin Center: 10.9375, Count: 28
Bin Center: 11.5625, Count: 26
Bin Center: 12.1875, Count: 1577


Histogram values for pgscan:
Bin Center: -0.475, Count: 0
Bin Center: -0.42500000000000004, Count: 0
Bin Center: -0.375, Count: 0
Bin Center: -0.32499999999999996, Count: 0
Bin Center: -0.275, Count: 0
Bin Center: -0.22499999999999998, Count: 0
Bin Center: -0.17499999999999996, Count: 0
Bin Center: -0.12499999999999997, Count: 0
Bin Center: -0.07499999999999998, Count: 0
Bin Center: -0.024999999999999994, Count: 0
Bin Center: 0.025000000000000022, Count: 8192
Bin Center: 0.07500000000000007, Count: 0
Bin Center: 0.12500000000000006, Count: 0
Bin Center: 0.17500000000000004, Count: 0
Bin Center: 0.22500000000000003, Count: 0
Bin Center: 0.275, Count: 0
Bin Center: 0.32500000000000007, Count: 0
Bin Center: 0.37500000000000006, Count: 0
Bin Center: 0.42500000000000004, Count: 0
Bin Center: 0.47500000000000003, Count: 0


Histogram values for atch:
Bin Center: 0.0375, Count: 4575
Bin Center: 0.11249999999999999, Count: 0
Bin Center: 0.1875, Count: 804
Bin Center: 0.26249999999999996, Count: 0
Bin Center: 0.3375, Count: 0
Bin Center: 0.4125, Count: 504
Bin Center: 0.4875, Count: 0
Bin Center: 0.5625, Count: 0
Bin Center: 0.6375, Count: 307
Bin Center: 0.7124999999999999, Count: 0
Bin Center: 0.7875, Count: 289
Bin Center: 0.8624999999999999, Count: 0
Bin Center: 0.9375, Count: 0
Bin Center: 1.0125, Count: 203
Bin Center: 1.0875, Count: 0
Bin Center: 1.1625, Count: 3
Bin Center: 1.2374999999999998, Count: 171
Bin Center: 1.3125, Count: 0
Bin Center: 1.3875, Count: 127
Bin Center: 1.4625, Count: 1209


Histogram values for pgin:
Bin Center: 0.5878125000000001, Count: 2766
Bin Center: 1.7634375000000002, Count: 1014
Bin Center: 2.9390625000000004, Count: 668
Bin Center: 4.1146875000000005, Count: 480
Bin Center: 5.290312500000001, Count: 343
Bin Center: 6.465937500000001, Count: 312
Bin Center: 7.641562500000001, Count: 302
Bin Center: 8.817187500000001, Count: 224
Bin Center: 9.992812500000001, Count: 168
Bin Center: 11.168437500000001, Count: 181
Bin Center: 12.344062500000001, Count: 164
Bin Center: 13.519687500000002, Count: 128
Bin Center: 14.695312500000002, Count: 125
Bin Center: 15.870937500000002, Count: 103
Bin Center: 17.0465625, Count: 87
Bin Center: 18.222187500000004, Count: 76
Bin Center: 19.3978125, Count: 68
Bin Center: 20.573437500000004, Count: 80
Bin Center: 21.7490625, Count: 55
Bin Center: 22.924687500000005, Count: 848


Histogram values for ppgin:
Bin Center: 0.8400000000000001, Count: 3029
Bin Center: 2.5200000000000005, Count: 860
Bin Center: 4.200000000000001, Count: 627
Bin Center: 5.880000000000001, Count: 433
Bin Center: 7.5600000000000005, Count: 327
Bin Center: 9.240000000000002, Count: 330
Bin Center: 10.920000000000002, Count: 245
Bin Center: 12.600000000000001, Count: 249
Bin Center: 14.280000000000001, Count: 174
Bin Center: 15.96, Count: 151
Bin Center: 17.64, Count: 157
Bin Center: 19.32, Count: 110
Bin Center: 21.000000000000004, Count: 118
Bin Center: 22.680000000000003, Count: 105
Bin Center: 24.360000000000003, Count: 108
Bin Center: 26.040000000000003, Count: 84
Bin Center: 27.720000000000002, Count: 74
Bin Center: 29.400000000000002, Count: 68
Bin Center: 31.080000000000002, Count: 61
Bin Center: 32.760000000000005, Count: 882


Histogram values for pflt:
Bin Center: 9.0375, Count: 1557
Bin Center: 27.112499999999997, Count: 1224
Bin Center: 45.1875, Count: 942
Bin Center: 63.262499999999996, Count: 665
Bin Center: 81.3375, Count: 496
Bin Center: 99.4125, Count: 426
Bin Center: 117.48749999999998, Count: 315
Bin Center: 135.5625, Count: 306
Bin Center: 153.6375, Count: 281
Bin Center: 171.71249999999998, Count: 249
Bin Center: 189.7875, Count: 224
Bin Center: 207.86249999999998, Count: 208
Bin Center: 225.9375, Count: 186
Bin Center: 244.0125, Count: 179
Bin Center: 262.0875, Count: 121
Bin Center: 280.1625, Count: 92
Bin Center: 298.23749999999995, Count: 94
Bin Center: 316.3125, Count: 80
Bin Center: 334.3875, Count: 74
Bin Center: 352.4625, Count: 473


Histogram values for vflt:
Bin Center: 14.23, Count: 1422
Bin Center: 42.290000000000006, Count: 986
Bin Center: 70.35000000000001, Count: 844
Bin Center: 98.41000000000001, Count: 678
Bin Center: 126.47, Count: 574
Bin Center: 154.53, Count: 524
Bin Center: 182.59, Count: 448
Bin Center: 210.65, Count: 368
Bin Center: 238.71, Count: 310
Bin Center: 266.77, Count: 228
Bin Center: 294.83000000000004, Count: 216
Bin Center: 322.89, Count: 173
Bin Center: 350.95000000000005, Count: 151
Bin Center: 379.01, Count: 165
Bin Center: 407.07000000000005, Count: 124
Bin Center: 435.13, Count: 131
Bin Center: 463.19000000000005, Count: 109
Bin Center: 491.25, Count: 102
Bin Center: 519.3100000000001, Count: 86
Bin Center: 547.3700000000001, Count: 553


Histogram values for freemem:
Bin Center: 170.103125, Count: 2557
Bin Center: 400.30937500000005, Count: 1283
Bin Center: 630.515625, Count: 850
Bin Center: 860.7218750000001, Count: 436
Bin Center: 1090.928125, Count: 368
Bin Center: 1321.134375, Count: 260
Bin Center: 1551.3406250000003, Count: 169
Bin Center: 1781.546875, Count: 151
Bin Center: 2011.7531250000002, Count: 161
Bin Center: 2241.9593750000004, Count: 77
Bin Center: 2472.165625, Count: 134
Bin Center: 2702.3718750000003, Count: 108
Bin Center: 2932.578125, Count: 114
Bin Center: 3162.784375, Count: 67
Bin Center: 3392.9906250000004, Count: 53
Bin Center: 3623.196875, Count: 64
Bin Center: 3853.4031250000003, Count: 43
Bin Center: 4083.6093750000005, Count: 56
Bin Center: 4313.815625, Count: 14
Bin Center: 4544.021875, Count: 1227


Histogram values for freeswap:
Bin Center: 66794.4375, Count: 294
Bin Center: 178404.3125, Count: 0
Bin Center: 290014.1875, Count: 0
Bin Center: 401624.0625, Count: 0
Bin Center: 513233.9375, Count: 0
Bin Center: 624843.8125, Count: 0
Bin Center: 736453.6875, Count: 0
Bin Center: 848063.5625, Count: 0
Bin Center: 959673.4375, Count: 1159
Bin Center: 1071283.3125, Count: 2521
Bin Center: 1182893.1875, Count: 117
Bin Center: 1294503.0625, Count: 285
Bin Center: 1406112.9375, Count: 305
Bin Center: 1517722.8125, Count: 538
Bin Center: 1629332.6875, Count: 325
Bin Center: 1740942.5625, Count: 1436
Bin Center: 1852552.4375, Count: 1208
Bin Center: 1964162.3125, Count: 1
Bin Center: 2075772.1875, Count: 1
Bin Center: 2187382.0625, Count: 2


Histogram values for usr:
Bin Center: 62.4375, Count: 489
Bin Center: 64.3125, Count: 52
Bin Center: 66.1875, Count: 75
Bin Center: 68.0625, Count: 46
Bin Center: 69.9375, Count: 93
Bin Center: 71.8125, Count: 126
Bin Center: 73.6875, Count: 169
Bin Center: 75.5625, Count: 223
Bin Center: 77.4375, Count: 270
Bin Center: 79.3125, Count: 316
Bin Center: 81.1875, Count: 388
Bin Center: 83.0625, Count: 230
Bin Center: 84.9375, Count: 506
Bin Center: 86.8125, Count: 621
Bin Center: 88.6875, Count: 760
Bin Center: 90.5625, Count: 907
Bin Center: 92.4375, Count: 837
Bin Center: 94.3125, Count: 826
Bin Center: 96.1875, Count: 820
Bin Center: 98.0625, Count: 438


In [17]:
# Set up subplot for boxplot
fig, axes = plt.subplots(figsize=(20,16))

# Draw boxplot for all numerical columns
sns.boxplot(data=df[num_columns], ax=axes)
axes.set_title('Boxplot for Numerical Columns')

# Show the boxplot
plt.show()
No description has been provided for this image
In [ ]:
 
In [18]:
# Create a boxplot using Plotly Express
fig = px.box(df, y=num_columns, title='Boxplot for Numerical Columns')
fig.update_layout(height=600, width=1000)
fig.update_xaxes(tickangle=90)
# Show the plot
fig.show()

Bivariate Analysis¶

Numeric Vs Numeric¶

In [ ]:
# Pairplot
sns.pairplot(df[num_columns])
plt.suptitle("Pairplot of Numerical Variables", y=1.02)
sns.set(rc={'figure.figsize':(15.7,20.27)})
plt.savefig("pair_NvN.png", format="png", bbox_inches='tight')
plt.show()

# Correlation Matrix
correlation_matrix = df[num_columns].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
sns.set(rc={'figure.figsize':(15.7,20.27)})
#plt.savefig("corr_NvN.png", format="png", bbox_inches='tight')
plt.show()

# Scatterplots
for i in range(len(num_columns)):
    for j in range(i+1, len(num_columns)):
        plt.scatter(df[num_columns[i]], df[num_columns[j]])
        plt.xlabel(num_columns[i])
        plt.ylabel(num_columns[j])
        plt.title(f'Scatterplot: {num_columns[i]} vs {num_columns[j]}')
        #plt.savefig(f"scatter_NvN_{num_columns[i]}_vs_{num_columns[j]}.png", format="png", bbox_inches='tight')
        plt.show()


# Jointplots
#for i in range(len(num_columns)):
#    for j in range(i+1, len(num_columns)):
#        sns.jointplot(x=num_columns[i], y=num_columns[j], data=df, kind='scatter')
#        fig.suptitle(f'Jointplot: {num_columns[i]} vs {num_columns[j]}', y=1.02)
        #plt.savefig(f"joint_NvN_{num_columns[i]}_vs_{num_columns[j]}.png", format="png", bbox_inches='tight')
#        plt.show()
        
In [ ]:
# Output for the above cell has been redacted due large file size
In [20]:
# Convert correlation matrix to a long-form DataFrame for sorting
correlation_df = correlation_matrix.unstack().reset_index(name='Correlation')
sorted_correlation_df = correlation_df.sort_values(by='Correlation', ascending=False)

# Print the sorted correlation values
print("Sorted Correlation Matrix (Descending Order):")
for idx, row in sorted_correlation_df.iterrows():
    print(f"{row['level_0']} vs {row['level_1']}: {row['Correlation']:.2f}")
Sorted Correlation Matrix (Descending Order):
lread vs lread: 1.00
pgfree vs pgfree: 1.00
sread vs sread: 1.00
swrite vs swrite: 1.00
fork vs fork: 1.00
exec vs exec: 1.00
rchar vs rchar: 1.00
wchar vs wchar: 1.00
pgout vs pgout: 1.00
pgscan vs pgscan: 1.00
lwrite vs lwrite: 1.00
atch vs atch: 1.00
pgin vs pgin: 1.00
ppgin vs ppgin: 1.00
pflt vs pflt: 1.00
vflt vs vflt: 1.00
freemem vs freemem: 1.00
freeswap vs freeswap: 1.00
scall vs scall: 1.00
ppgout vs ppgout: 1.00
usr vs usr: 1.00
fork vs vflt: 0.94
vflt vs fork: 0.94
pflt vs vflt: 0.94
vflt vs pflt: 0.94
pflt vs fork: 0.93
fork vs pflt: 0.93
ppgin vs pgin: 0.92
pgin vs ppgin: 0.92
pgfree vs ppgout: 0.92
ppgout vs pgfree: 0.92
pgscan vs pgfree: 0.92
pgfree vs pgscan: 0.92
swrite vs sread: 0.88
sread vs swrite: 0.88
ppgout vs pgout: 0.87
pgout vs ppgout: 0.87
ppgout vs pgscan: 0.79
pgscan vs ppgout: 0.79
fork vs exec: 0.76
exec vs fork: 0.76
pgout vs pgfree: 0.73
pgfree vs pgout: 0.73
sread vs scall: 0.70
scall vs sread: 0.70
vflt vs exec: 0.69
exec vs vflt: 0.69
freeswap vs usr: 0.68
usr vs freeswap: 0.68
pflt vs exec: 0.65
exec vs pflt: 0.65
swrite vs scall: 0.62
scall vs swrite: 0.62
pgfree vs ppgin: 0.59
ppgin vs pgfree: 0.59
freemem vs freeswap: 0.57
freeswap vs freemem: 0.57
pgscan vs ppgin: 0.56
ppgin vs pgscan: 0.56
pgscan vs pgout: 0.55
pgout vs pgscan: 0.55
ppgout vs ppgin: 0.54
ppgin vs ppgout: 0.54
lread vs lwrite: 0.53
lwrite vs lread: 0.53
pgin vs pgfree: 0.53
pgfree vs pgin: 0.53
vflt vs scall: 0.53
scall vs vflt: 0.53
rchar vs wchar: 0.50
wchar vs rchar: 0.50
sread vs rchar: 0.50
rchar vs sread: 0.50
pgscan vs pgin: 0.50
pgin vs pgscan: 0.50
vflt vs sread: 0.49
sread vs vflt: 0.49
ppgout vs pgin: 0.49
pgin vs ppgout: 0.49
pflt vs scall: 0.48
scall vs pflt: 0.48
sread vs pflt: 0.45
pflt vs sread: 0.45
fork vs scall: 0.45
scall vs fork: 0.45
fork vs sread: 0.42
sread vs fork: 0.42
vflt vs swrite: 0.42
swrite vs vflt: 0.42
ppgin vs pgout: 0.41
pgout vs ppgin: 0.41
wchar vs sread: 0.40
sread vs wchar: 0.40
swrite vs pflt: 0.40
pflt vs swrite: 0.40
wchar vs swrite: 0.39
swrite vs wchar: 0.39
pgin vs pgout: 0.39
pgout vs pgin: 0.39
swrite vs fork: 0.38
fork vs swrite: 0.38
rchar vs vflt: 0.36
vflt vs rchar: 0.36
rchar vs scall: 0.35
scall vs rchar: 0.35
rchar vs ppgin: 0.35
ppgin vs rchar: 0.35
swrite vs rchar: 0.33
rchar vs swrite: 0.33
pflt vs rchar: 0.31
rchar vs pflt: 0.31
scall vs exec: 0.31
exec vs scall: 0.31
vflt vs pgin: 0.30
pgin vs vflt: 0.30
pgfree vs vflt: 0.30
vflt vs pgfree: 0.30
rchar vs pgin: 0.30
pgin vs rchar: 0.30
vflt vs ppgout: 0.29
ppgout vs vflt: 0.29
pgscan vs vflt: 0.28
vflt vs pgscan: 0.28
fork vs rchar: 0.28
rchar vs fork: 0.28
pgfree vs rchar: 0.28
rchar vs pgfree: 0.28
wchar vs scall: 0.27
scall vs wchar: 0.27
usr vs freemem: 0.27
freemem vs usr: 0.27
rchar vs ppgout: 0.27
ppgout vs rchar: 0.27
vflt vs ppgin: 0.26
ppgin vs vflt: 0.26
pgscan vs rchar: 0.26
rchar vs pgscan: 0.26
scall vs pgin: 0.24
pgin vs scall: 0.24
vflt vs pgout: 0.23
pgout vs vflt: 0.23
sread vs ppgout: 0.23
ppgout vs sread: 0.23
ppgin vs scall: 0.22
scall vs ppgin: 0.22
sread vs pgfree: 0.21
pgfree vs sread: 0.21
rchar vs pgout: 0.21
pgout vs rchar: 0.21
sread vs ppgin: 0.21
ppgin vs sread: 0.21
scall vs ppgout: 0.21
ppgout vs scall: 0.21
sread vs pgin: 0.21
pgin vs sread: 0.21
wchar vs ppgin: 0.20
ppgin vs wchar: 0.20
scall vs pgfree: 0.20
pgfree vs scall: 0.20
scall vs pgout: 0.19
pgout vs scall: 0.19
pgscan vs sread: 0.19
sread vs pgscan: 0.19
sread vs pgout: 0.19
pgout vs sread: 0.19
wchar vs pgout: 0.19
pgout vs wchar: 0.19
scall vs lread: 0.19
lread vs scall: 0.19
pgfree vs pflt: 0.19
pflt vs pgfree: 0.19
lread vs pgin: 0.19
pgin vs lread: 0.19
wchar vs ppgout: 0.19
ppgout vs wchar: 0.19
exec vs pgin: 0.19
pgin vs exec: 0.19
ppgout vs pflt: 0.19
pflt vs ppgout: 0.19
wchar vs atch: 0.18
atch vs wchar: 0.18
pgscan vs pflt: 0.18
pflt vs pgscan: 0.18
pgin vs wchar: 0.18
wchar vs pgin: 0.18
scall vs pgscan: 0.18
pgscan vs scall: 0.18
pgin vs pflt: 0.18
pflt vs pgin: 0.18
atch vs rchar: 0.17
rchar vs atch: 0.17
exec vs rchar: 0.17
rchar vs exec: 0.17
fork vs pgfree: 0.17
pgfree vs fork: 0.17
ppgout vs fork: 0.17
fork vs ppgout: 0.17
vflt vs lread: 0.17
lread vs vflt: 0.17
exec vs sread: 0.16
sread vs exec: 0.16
fork vs pgin: 0.16
pgin vs fork: 0.16
lread vs ppgin: 0.16
ppgin vs lread: 0.16
fork vs pgscan: 0.16
pgscan vs fork: 0.16
swrite vs ppgout: 0.16
ppgout vs swrite: 0.16
wchar vs pgfree: 0.16
pgfree vs wchar: 0.16
pgout vs swrite: 0.15
swrite vs pgout: 0.15
pgout vs pflt: 0.15
pflt vs pgout: 0.15
ppgin vs pflt: 0.15
pflt vs ppgin: 0.15
ppgin vs exec: 0.15
exec vs ppgin: 0.15
exec vs ppgout: 0.15
ppgout vs exec: 0.15
pgout vs atch: 0.15
atch vs pgout: 0.15
swrite vs pgin: 0.15
pgin vs swrite: 0.15
exec vs pgfree: 0.15
pgfree vs exec: 0.15
pgfree vs swrite: 0.15
swrite vs pgfree: 0.15
exec vs pgscan: 0.14
pgscan vs exec: 0.14
swrite vs ppgin: 0.14
ppgin vs swrite: 0.14
lwrite vs scall: 0.14
scall vs lwrite: 0.14
fork vs lread: 0.14
lread vs fork: 0.14
lread vs pflt: 0.14
pflt vs lread: 0.14
sread vs lread: 0.13
lread vs sread: 0.13
ppgin vs fork: 0.13
fork vs ppgin: 0.13
lread vs ppgout: 0.13
ppgout vs lread: 0.13
fork vs pgout: 0.13
pgout vs fork: 0.13
sread vs lwrite: 0.13
lwrite vs sread: 0.13
swrite vs pgscan: 0.12
pgscan vs swrite: 0.12
swrite vs lread: 0.12
lread vs swrite: 0.12
rchar vs lwrite: 0.12
lwrite vs rchar: 0.12
lread vs pgfree: 0.11
pgfree vs lread: 0.11
wchar vs pgscan: 0.11
pgscan vs wchar: 0.11
exec vs pgout: 0.11
pgout vs exec: 0.11
vflt vs wchar: 0.11
wchar vs vflt: 0.11
exec vs lread: 0.11
lread vs exec: 0.11
lread vs rchar: 0.11
rchar vs lread: 0.11
swrite vs exec: 0.10
exec vs swrite: 0.10
swrite vs lwrite: 0.10
lwrite vs swrite: 0.10
atch vs vflt: 0.10
vflt vs atch: 0.10
vflt vs lwrite: 0.09
lwrite vs vflt: 0.09
ppgout vs atch: 0.09
atch vs ppgout: 0.09
lwrite vs wchar: 0.09
wchar vs lwrite: 0.09
lwrite vs pgin: 0.09
pgin vs lwrite: 0.09
lwrite vs ppgin: 0.09
ppgin vs lwrite: 0.09
lread vs pgscan: 0.09
pgscan vs lread: 0.09
wchar vs pflt: 0.09
pflt vs wchar: 0.09
atch vs sread: 0.09
sread vs atch: 0.09
pgout vs lread: 0.08
lread vs pgout: 0.08
wchar vs lread: 0.08
lread vs wchar: 0.08
ppgout vs lwrite: 0.08
lwrite vs ppgout: 0.08
scall vs atch: 0.08
atch vs scall: 0.08
pgfree vs atch: 0.07
atch vs pgfree: 0.07
lwrite vs pflt: 0.07
pflt vs lwrite: 0.07
lwrite vs pgout: 0.07
pgout vs lwrite: 0.07
lwrite vs pgfree: 0.07
pgfree vs lwrite: 0.07
atch vs swrite: 0.06
swrite vs atch: 0.06
fork vs wchar: 0.06
wchar vs fork: 0.06
pgin vs atch: 0.06
atch vs pgin: 0.06
atch vs ppgin: 0.06
ppgin vs atch: 0.06
fork vs lwrite: 0.05
lwrite vs fork: 0.05
atch vs exec: 0.05
exec vs atch: 0.05
pflt vs atch: 0.05
atch vs pflt: 0.05
atch vs fork: 0.05
fork vs atch: 0.05
pgscan vs lwrite: 0.04
lwrite vs pgscan: 0.04
pgscan vs atch: 0.04
atch vs pgscan: 0.04
exec vs lwrite: 0.04
lwrite vs exec: 0.04
atch vs lwrite: 0.03
lwrite vs atch: 0.03
atch vs lread: 0.02
lread vs atch: 0.02
wchar vs exec: 0.00
exec vs wchar: 0.00
lread vs freeswap: -0.08
freeswap vs lread: -0.08
freemem vs lread: -0.08
lread vs freemem: -0.08
atch vs freemem: -0.09
freemem vs atch: -0.09
freemem vs lwrite: -0.09
lwrite vs freemem: -0.09
usr vs lwrite: -0.11
lwrite vs usr: -0.11
pflt vs freemem: -0.11
freemem vs pflt: -0.11
freeswap vs lwrite: -0.12
lwrite vs freeswap: -0.12
freeswap vs atch: -0.12
atch vs freeswap: -0.12
fork vs freemem: -0.12
freemem vs fork: -0.12
usr vs atch: -0.13
atch vs usr: -0.13
freeswap vs fork: -0.13
fork vs freeswap: -0.13
pflt vs freeswap: -0.13
freeswap vs pflt: -0.13
usr vs lread: -0.14
lread vs usr: -0.14
freemem vs wchar: -0.15
wchar vs freemem: -0.15
freemem vs rchar: -0.15
rchar vs freemem: -0.15
freeswap vs exec: -0.15
exec vs freeswap: -0.15
freemem vs exec: -0.16
exec vs freemem: -0.16
pgscan vs freeswap: -0.18
freeswap vs pgscan: -0.18
pgscan vs usr: -0.18
usr vs pgscan: -0.18
freemem vs pgscan: -0.19
pgscan vs freemem: -0.19
freemem vs vflt: -0.20
vflt vs freemem: -0.20
freeswap vs pgfree: -0.21
pgfree vs freeswap: -0.21
usr vs ppgout: -0.21
ppgout vs usr: -0.21
ppgout vs freeswap: -0.21
freeswap vs ppgout: -0.21
ppgin vs freemem: -0.22
freemem vs ppgin: -0.22
usr vs pgfree: -0.22
pgfree vs usr: -0.22
rchar vs freeswap: -0.22
freeswap vs rchar: -0.22
usr vs pgout: -0.22
pgout vs usr: -0.22
wchar vs freeswap: -0.23
freeswap vs wchar: -0.23
freemem vs pgin: -0.23
pgin vs freemem: -0.23
usr vs ppgin: -0.23
ppgin vs usr: -0.23
freemem vs pgfree: -0.23
pgfree vs freemem: -0.23
freeswap vs swrite: -0.24
swrite vs freeswap: -0.24
usr vs pgin: -0.24
pgin vs usr: -0.24
freeswap vs pgout: -0.25
pgout vs freeswap: -0.25
vflt vs freeswap: -0.25
freeswap vs vflt: -0.25
freemem vs ppgout: -0.25
ppgout vs freemem: -0.25
swrite vs freemem: -0.25
freemem vs swrite: -0.25
freeswap vs ppgin: -0.25
ppgin vs freeswap: -0.25
freemem vs pgout: -0.27
pgout vs freemem: -0.27
usr vs swrite: -0.27
swrite vs usr: -0.27
pgin vs freeswap: -0.28
freeswap vs pgin: -0.28
freemem vs sread: -0.29
sread vs freemem: -0.29
usr vs exec: -0.29
exec vs usr: -0.29
wchar vs usr: -0.29
usr vs wchar: -0.29
sread vs freeswap: -0.30
freeswap vs sread: -0.30
usr vs scall: -0.32
scall vs usr: -0.32
usr vs rchar: -0.33
rchar vs usr: -0.33
usr vs sread: -0.33
sread vs usr: -0.33
freeswap vs scall: -0.35
scall vs freeswap: -0.35
usr vs fork: -0.36
fork vs usr: -0.36
usr vs pflt: -0.37
pflt vs usr: -0.37
freemem vs scall: -0.39
scall vs freemem: -0.39
usr vs vflt: -0.42
vflt vs usr: -0.42
In [874]:
# Convert correlation matrix to a long-form DataFrame for sorting
mask = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
correlation_df = correlation_matrix.where(mask)
correlation_df = correlation_df.unstack().reset_index(name='Correlation').dropna()

# Sort the correlation DataFrame
sorted_correlation_df = correlation_df.sort_values(by='Correlation', ascending=False)

# Print the sorted correlation values
print("Sorted Correlation Matrix (Descending Order):")
for idx, row in sorted_correlation_df.iterrows():
    print(f"{row['level_0']} vs {row['level_1']}: {row['Correlation']:.2f}")
Sorted Correlation Matrix (Descending Order):
pgfree vs ppgout: 0.97
ppgin vs pgin: 0.96
ppgout vs pgout: 0.95
pflt vs fork: 0.94
vflt vs fork: 0.93
vflt vs pflt: 0.93
pgfree vs pgout: 0.91
swrite vs sread: 0.88
lwrite vs lread: 0.83
exec vs fork: 0.77
vflt vs exec: 0.76
sread vs scall: 0.76
pflt vs exec: 0.76
swrite vs scall: 0.74
atch vs pgout: 0.64
atch vs ppgout: 0.61
freeswap vs freemem: 0.61
atch vs pgfree: 0.60
vflt vs sread: 0.60
rchar vs sread: 0.58
usr vs freeswap: 0.56
vflt vs swrite: 0.56
vflt vs scall: 0.55
pflt vs sread: 0.53
fork vs sread: 0.53
fork vs swrite: 0.52
pflt vs swrite: 0.51
wchar vs rchar: 0.49
pflt vs scall: 0.49
ppgin vs pgfree: 0.48
ppgin vs ppgout: 0.48
fork vs scall: 0.47
pgin vs pgfree: 0.46
pgin vs ppgout: 0.46
ppgin vs pgout: 0.45
exec vs scall: 0.44
vflt vs rchar: 0.44
pgin vs pgout: 0.44
wchar vs swrite: 0.43
vflt vs lread: 0.42
rchar vs swrite: 0.42
wchar vs sread: 0.42
vflt vs pgin: 0.40
ppgin vs rchar: 0.39
rchar vs scall: 0.39
vflt vs ppgin: 0.38
pflt vs rchar: 0.38
usr vs freemem: 0.38
pflt vs lread: 0.38
rchar vs fork: 0.37
exec vs sread: 0.37
pgin vs rchar: 0.37
fork vs lread: 0.37
exec vs lread: 0.36
pgin vs sread: 0.35
ppgin vs sread: 0.34
pgin vs scall: 0.34
ppgin vs atch: 0.33
scall vs lread: 0.33
wchar vs scall: 0.33
pgin vs atch: 0.33
sread vs lread: 0.33
ppgin vs scall: 0.33
rchar vs exec: 0.32
vflt vs pgfree: 0.32
vflt vs ppgout: 0.32
ppgout vs sread: 0.32
exec vs swrite: 0.31
pgfree vs sread: 0.31
pgin vs swrite: 0.31
swrite vs lread: 0.31
atch vs scall: 0.31
ppgout vs scall: 0.31
ppgin vs swrite: 0.30
pgout vs sread: 0.30
pgfree vs scall: 0.30
vflt vs pgout: 0.30
pgin vs exec: 0.30
pgout vs scall: 0.30
vflt vs atch: 0.30
atch vs sread: 0.29
ppgin vs lread: 0.29
ppgin vs exec: 0.29
ppgout vs swrite: 0.28
pgin vs lread: 0.28
pgfree vs swrite: 0.28
pgout vs swrite: 0.27
ppgout vs rchar: 0.27
atch vs rchar: 0.27
atch vs swrite: 0.26
pgfree vs rchar: 0.26
ppgin vs wchar: 0.26
rchar vs lread: 0.26
atch vs exec: 0.25
pflt vs pgin: 0.25
pgout vs rchar: 0.25
pgfree vs exec: 0.25
pgin vs wchar: 0.25
ppgout vs exec: 0.25
pgin vs fork: 0.25
pflt vs ppgin: 0.24
ppgin vs fork: 0.24
pgout vs exec: 0.23
atch vs lread: 0.23
pflt vs ppgout: 0.22
pflt vs pgfree: 0.22
ppgout vs lread: 0.22
pgfree vs fork: 0.22
pgfree vs lread: 0.21
ppgout vs fork: 0.21
pflt vs atch: 0.21
pgout vs lread: 0.21
pflt vs pgout: 0.21
ppgout vs wchar: 0.20
atch vs fork: 0.20
pgout vs fork: 0.20
pgout vs wchar: 0.20
pgfree vs wchar: 0.19
wchar vs lread: 0.17
atch vs wchar: 0.16
vflt vs wchar: 0.16
sread vs lwrite: 0.15
scall vs lwrite: 0.14
vflt vs lwrite: 0.14
wchar vs lwrite: 0.13
swrite vs lwrite: 0.13
atch vs lwrite: 0.13
pflt vs wchar: 0.13
wchar vs exec: 0.12
wchar vs fork: 0.12
exec vs lwrite: 0.12
ppgin vs lwrite: 0.12
rchar vs lwrite: 0.12
pgin vs lwrite: 0.11
pflt vs lwrite: 0.10
fork vs lwrite: 0.09
pgout vs lwrite: 0.09
ppgout vs lwrite: 0.09
pgfree vs lwrite: 0.08
freemem vs lwrite: -0.10
freeswap vs pflt: -0.13
freemem vs pflt: -0.13
freeswap vs fork: -0.13
freemem vs fork: -0.14
freemem vs wchar: -0.15
freeswap vs lwrite: -0.15
freemem vs rchar: -0.17
freeswap vs wchar: -0.18
freeswap vs exec: -0.18
usr vs lwrite: -0.19
freemem vs exec: -0.19
freemem vs lread: -0.20
freemem vs vflt: -0.23
freeswap vs rchar: -0.23
freeswap vs lread: -0.24
freeswap vs vflt: -0.26
freemem vs ppgin: -0.30
freemem vs pgin: -0.31
usr vs wchar: -0.32
freeswap vs swrite: -0.34
freeswap vs ppgout: -0.34
freeswap vs pgfree: -0.34
usr vs atch: -0.34
freeswap vs atch: -0.35
freeswap vs pgout: -0.35
freeswap vs ppgin: -0.35
freemem vs sread: -0.35
freemem vs swrite: -0.35
freeswap vs scall: -0.36
freeswap vs pgin: -0.37
freeswap vs sread: -0.37
usr vs pgout: -0.38
usr vs pgfree: -0.38
usr vs ppgout: -0.39
freemem vs scall: -0.39
usr vs lread: -0.44
freemem vs atch: -0.44
usr vs ppgin: -0.45
usr vs pgin: -0.46
freemem vs ppgout: -0.46
freemem vs pgfree: -0.46
freemem vs pgout: -0.47
usr vs rchar: -0.51
usr vs swrite: -0.60
usr vs exec: -0.61
usr vs scall: -0.62
usr vs sread: -0.64
usr vs fork: -0.67
usr vs pflt: -0.70
usr vs vflt: -0.75

Categorical to Numerical¶

In [21]:
# Set up subplots
fig, axes = plt.subplots(nrows=len(num_columns), ncols=len(cat_columns), figsize=(15, 100))

# Flatten the axes array
axes = axes.flatten()

# Iterate through numerical and categorical columns for boxplots
for i, num_col in enumerate(num_columns):
    for j, cat_col in enumerate(cat_columns):
        # Create boxplot for the current combination
        sns.boxplot(x=cat_col, y=num_col, data=df, ax=axes[i * len(cat_columns) + j])
        axes[i * len(cat_columns) + j].set_title(f'{num_col} vs {cat_col}')

# Adjust layout
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
# Iterate through numerical and categorical columns for statistical values
for num_col in num_columns:
    for cat_col in cat_columns:
        # Extract statistical values for the current combination
        stats = df.groupby(cat_col)[num_col].describe()
        
        # Print the statistics
        print(f"\nStatistics for {num_col} vs {cat_col}:\n{stats}")
In [22]:
# Convert categorical columns to numerical using one-hot encoding
df_encoded = pd.get_dummies(df, columns=cat_columns)

# Exclude numerical columns from encoded categorical columns
cat_columns_encoded = [col for col in df_encoded.columns if col not in num_columns]
num_columns = [col for col in df_encoded.columns if col not in cat_columns_encoded]

# Combine numerical and encoded categorical columns
all_columns = num_columns + cat_columns_encoded

# Compute the correlation matrix
corr_matrix = df_encoded[all_columns].corr()

# Set up the matplotlib figure
plt.figure(figsize=(30, 30))

# Create a heatmap for the correlation matrix
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")

# Set the title
plt.title('Correlation Heatmap for Numerical and Categorical Variables')

# Show the plot
plt.show()
No description has been provided for this image
In [23]:
# Flatten the upper triangle of the correlation matrix
corr_values = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)).stack().sort_values(ascending=False)

# Display correlations in descending order
print("Correlations in Descending Order:")
for index, value in corr_values.items():
    print(f"{index[0]} vs {index[1]}: {value:.2f}")
Correlations in Descending Order:
fork vs vflt: 0.94
pflt vs vflt: 0.94
fork vs pflt: 0.93
pgin vs ppgin: 0.92
ppgout vs pgfree: 0.92
pgfree vs pgscan: 0.92
sread vs swrite: 0.88
pgout vs ppgout: 0.87
ppgout vs pgscan: 0.79
fork vs exec: 0.76
pgout vs pgfree: 0.73
scall vs sread: 0.70
exec vs vflt: 0.69
freeswap vs usr: 0.68
exec vs pflt: 0.65
scall vs swrite: 0.62
pgfree vs ppgin: 0.59
freemem vs freeswap: 0.57
pgscan vs ppgin: 0.56
pgout vs pgscan: 0.55
ppgout vs ppgin: 0.54
lread vs lwrite: 0.53
pgfree vs pgin: 0.53
scall vs vflt: 0.53
rchar vs wchar: 0.50
sread vs rchar: 0.50
pgscan vs pgin: 0.50
sread vs vflt: 0.49
ppgout vs pgin: 0.49
scall vs pflt: 0.48
sread vs pflt: 0.45
scall vs fork: 0.45
sread vs fork: 0.42
swrite vs vflt: 0.42
pgout vs ppgin: 0.41
sread vs wchar: 0.40
swrite vs pflt: 0.40
swrite vs wchar: 0.39
pgout vs pgin: 0.39
swrite vs fork: 0.38
rchar vs vflt: 0.36
scall vs rchar: 0.35
rchar vs ppgin: 0.35
swrite vs rchar: 0.33
rchar vs pflt: 0.31
scall vs exec: 0.31
pgin vs vflt: 0.30
pgfree vs vflt: 0.30
rchar vs pgin: 0.30
ppgout vs vflt: 0.29
pgscan vs vflt: 0.28
fork vs rchar: 0.28
rchar vs pgfree: 0.28
scall vs wchar: 0.27
freemem vs usr: 0.27
rchar vs ppgout: 0.27
ppgin vs vflt: 0.26
usr vs runqsz_Not_CPU_Bound: 0.26
rchar vs pgscan: 0.26
scall vs runqsz_CPU_Bound: 0.24
scall vs pgin: 0.24
pgout vs vflt: 0.23
sread vs ppgout: 0.23
scall vs ppgin: 0.22
sread vs pgfree: 0.21
rchar vs pgout: 0.21
sread vs ppgin: 0.21
scall vs ppgout: 0.21
sread vs pgin: 0.21
wchar vs ppgin: 0.20
scall vs pgfree: 0.20
scall vs pgout: 0.19
rchar vs runqsz_CPU_Bound: 0.19
sread vs pgscan: 0.19
sread vs pgout: 0.19
wchar vs pgout: 0.19
lread vs scall: 0.19
pgfree vs pflt: 0.19
lread vs pgin: 0.19
wchar vs ppgout: 0.19
exec vs pgin: 0.19
ppgout vs pflt: 0.19
wchar vs atch: 0.18
pgscan vs pflt: 0.18
wchar vs pgin: 0.18
scall vs pgscan: 0.18
pgin vs pflt: 0.18
rchar vs atch: 0.17
sread vs runqsz_CPU_Bound: 0.17
exec vs rchar: 0.17
fork vs pgfree: 0.17
fork vs ppgout: 0.17
lread vs vflt: 0.17
freemem vs runqsz_Not_CPU_Bound: 0.16
wchar vs runqsz_CPU_Bound: 0.16
sread vs exec: 0.16
fork vs pgin: 0.16
lread vs ppgin: 0.16
fork vs pgscan: 0.16
swrite vs ppgout: 0.16
wchar vs pgfree: 0.16
swrite vs pgout: 0.15
pgout vs pflt: 0.15
ppgin vs pflt: 0.15
exec vs ppgin: 0.15
exec vs ppgout: 0.15
pgout vs atch: 0.15
swrite vs pgin: 0.15
exec vs pgfree: 0.15
swrite vs pgfree: 0.15
exec vs pgscan: 0.14
swrite vs ppgin: 0.14
lwrite vs scall: 0.14
lread vs fork: 0.14
lread vs pflt: 0.14
lread vs sread: 0.13
swrite vs runqsz_CPU_Bound: 0.13
fork vs ppgin: 0.13
lread vs ppgout: 0.13
fork vs pgout: 0.13
lwrite vs sread: 0.13
vflt vs runqsz_CPU_Bound: 0.12
swrite vs pgscan: 0.12
lread vs swrite: 0.12
pflt vs runqsz_CPU_Bound: 0.12
lwrite vs rchar: 0.12
lread vs pgfree: 0.11
wchar vs pgscan: 0.11
exec vs pgout: 0.11
wchar vs vflt: 0.11
lread vs exec: 0.11
lread vs rchar: 0.11
swrite vs exec: 0.10
lwrite vs swrite: 0.10
atch vs vflt: 0.10
lwrite vs vflt: 0.09
ppgout vs atch: 0.09
fork vs runqsz_CPU_Bound: 0.09
lwrite vs wchar: 0.09
lwrite vs pgin: 0.09
lwrite vs ppgin: 0.09
lread vs pgscan: 0.09
wchar vs pflt: 0.09
sread vs atch: 0.09
lread vs pgout: 0.08
lread vs wchar: 0.08
lwrite vs ppgout: 0.08
scall vs atch: 0.08
pgin vs runqsz_CPU_Bound: 0.07
ppgin vs runqsz_CPU_Bound: 0.07
pgfree vs atch: 0.07
lread vs runqsz_CPU_Bound: 0.07
lwrite vs pflt: 0.07
lwrite vs pgout: 0.07
freeswap vs runqsz_Not_CPU_Bound: 0.07
lwrite vs pgfree: 0.07
swrite vs atch: 0.06
fork vs wchar: 0.06
atch vs pgin: 0.06
atch vs ppgin: 0.06
lwrite vs fork: 0.05
atch vs runqsz_CPU_Bound: 0.05
exec vs atch: 0.05
atch vs pflt: 0.05
lwrite vs runqsz_CPU_Bound: 0.05
exec vs runqsz_CPU_Bound: 0.05
fork vs atch: 0.05
pgfree vs runqsz_CPU_Bound: 0.04
lwrite vs pgscan: 0.04
pgscan vs atch: 0.04
lwrite vs exec: 0.04
pgscan vs runqsz_CPU_Bound: 0.04
ppgout vs runqsz_CPU_Bound: 0.04
lwrite vs atch: 0.03
pgout vs runqsz_CPU_Bound: 0.02
lread vs atch: 0.02
exec vs wchar: 0.00
pgout vs runqsz_Not_CPU_Bound: -0.02
ppgout vs runqsz_Not_CPU_Bound: -0.04
pgscan vs runqsz_Not_CPU_Bound: -0.04
pgfree vs runqsz_Not_CPU_Bound: -0.04
exec vs runqsz_Not_CPU_Bound: -0.05
lwrite vs runqsz_Not_CPU_Bound: -0.05
atch vs runqsz_Not_CPU_Bound: -0.05
freeswap vs runqsz_CPU_Bound: -0.07
lread vs runqsz_Not_CPU_Bound: -0.07
ppgin vs runqsz_Not_CPU_Bound: -0.07
pgin vs runqsz_Not_CPU_Bound: -0.07
lread vs freeswap: -0.08
lread vs freemem: -0.08
atch vs freemem: -0.09
lwrite vs freemem: -0.09
fork vs runqsz_Not_CPU_Bound: -0.09
lwrite vs usr: -0.11
pflt vs freemem: -0.11
pflt vs runqsz_Not_CPU_Bound: -0.12
lwrite vs freeswap: -0.12
vflt vs runqsz_Not_CPU_Bound: -0.12
atch vs freeswap: -0.12
fork vs freemem: -0.12
atch vs usr: -0.13
fork vs freeswap: -0.13
pflt vs freeswap: -0.13
swrite vs runqsz_Not_CPU_Bound: -0.13
lread vs usr: -0.14
wchar vs freemem: -0.15
rchar vs freemem: -0.15
exec vs freeswap: -0.15
exec vs freemem: -0.16
wchar vs runqsz_Not_CPU_Bound: -0.16
freemem vs runqsz_CPU_Bound: -0.16
sread vs runqsz_Not_CPU_Bound: -0.17
pgscan vs freeswap: -0.18
pgscan vs usr: -0.18
pgscan vs freemem: -0.19
rchar vs runqsz_Not_CPU_Bound: -0.19
vflt vs freemem: -0.20
pgfree vs freeswap: -0.21
ppgout vs usr: -0.21
ppgout vs freeswap: -0.21
ppgin vs freemem: -0.22
pgfree vs usr: -0.22
rchar vs freeswap: -0.22
pgout vs usr: -0.22
wchar vs freeswap: -0.23
pgin vs freemem: -0.23
ppgin vs usr: -0.23
pgfree vs freemem: -0.23
swrite vs freeswap: -0.24
pgin vs usr: -0.24
scall vs runqsz_Not_CPU_Bound: -0.24
pgout vs freeswap: -0.25
vflt vs freeswap: -0.25
ppgout vs freemem: -0.25
swrite vs freemem: -0.25
ppgin vs freeswap: -0.25
usr vs runqsz_CPU_Bound: -0.26
pgout vs freemem: -0.27
swrite vs usr: -0.27
pgin vs freeswap: -0.28
sread vs freemem: -0.29
exec vs usr: -0.29
wchar vs usr: -0.29
sread vs freeswap: -0.30
scall vs usr: -0.32
rchar vs usr: -0.33
sread vs usr: -0.33
scall vs freeswap: -0.35
fork vs usr: -0.36
pflt vs usr: -0.37
scall vs freemem: -0.39
vflt vs usr: -0.42
runqsz_CPU_Bound vs runqsz_Not_CPU_Bound: -1.00

Problem 1 - Data Pre-processing¶

In [24]:
df.isnull().sum()
Out[24]:
lread         0
lwrite        0
scall         0
sread         0
swrite        0
fork          0
exec          0
rchar       104
wchar        15
pgout         0
ppgout        0
pgfree        0
pgscan        0
atch          0
pgin          0
ppgin         0
pflt          0
vflt          0
runqsz        0
freemem       0
freeswap      0
usr           0
dtype: int64
In [25]:
#impute null values
df['rchar']=df['rchar'].fillna(df['rchar'].median())
df['wchar']=df['wchar'].fillna(df['wchar'].median())
In [26]:
df.isnull().sum()
Out[26]:
lread       0
lwrite      0
scall       0
sread       0
swrite      0
fork        0
exec        0
rchar       0
wchar       0
pgout       0
ppgout      0
pgfree      0
pgscan      0
atch        0
pgin        0
ppgin       0
pflt        0
vflt        0
runqsz      0
freemem     0
freeswap    0
usr         0
dtype: int64
In [27]:
def remove_outlier(col):
    col = sorted(col)
    Q1, Q3 = np.percentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range

# List to store the lower and upper ranges for each column
ranges = []

# Iterate through numerical columns and remove outliers
for num_col in num_columns:
    lr, ur = remove_outlier(df[num_col])
    print(f"For {num_col}, lower range is {lr} and upper range is {ur}")
    
    # Store the ranges for reference
    ranges.append((num_col, lr, ur))
    
    # Remove outliers in the dataframe
    df[num_col] = np.where(df[num_col] > ur, ur, df[num_col])
    df[num_col] = np.where(df[num_col] < lr, lr, df[num_col])

# Print the dataframe after removing outliers
print(df.head())
For lread, lower range is -25.0 and upper range is 47.0
For lwrite, lower range is -15.0 and upper range is 25.0
For scall, lower range is -2445.875 and upper range is 6775.125
For sread, lower range is -203.5 and upper range is 568.5
For swrite, lower range is -120.0 and upper range is 368.0
For fork, lower range is -2.3000000000000003 and upper range is 4.9
For exec, lower range is -3.6999999999999993 and upper range is 6.699999999999999
For rchar, lower range is -310940.875 and upper range is 611196.125
For wchar, lower range is -101611.125 and upper range is 230625.875
For pgout, lower range is -3.5999999999999996 and upper range is 6.0
For ppgout, lower range is -6.300000000000001 and upper range is 10.5
For pgfree, lower range is -7.5 and upper range is 12.5
For pgscan, lower range is 0.0 and upper range is 0.0
For atch, lower range is -0.8999999999999999 and upper range is 1.5
For pgin, lower range is -13.147500000000003 and upper range is 23.512500000000003
For ppgin, lower range is -19.2 and upper range is 33.6
For pflt, lower range is -176.89999999999998 and upper range is 361.5
For vflt, lower range is -264.20000000000005 and upper range is 561.4000000000001
For freemem, lower range is -2425.875 and upper range is 4659.125
For freeswap, lower range is 10989.5 and upper range is 2762013.5
For usr, lower range is 61.5 and upper range is 113.5
   lread  lwrite   scall  sread  swrite  fork  exec     rchar    wchar  pgout  \
0    1.0     0.0  2147.0   79.0    68.0   0.2   0.2   40671.0  53995.0    0.0   
1    0.0     0.0   170.0   18.0    21.0   0.2   0.2     448.0   8385.0    0.0   
2   15.0     3.0  2162.0  159.0   119.0   2.0   2.4  125473.5  31950.0    0.0   
3    0.0     0.0   160.0   12.0    16.0   0.2   0.2  125473.5   8670.0    0.0   
4    5.0     1.0   330.0   39.0    38.0   0.4   0.4  125473.5  12185.0    0.0   

   ...  pgscan  atch  pgin  ppgin    pflt    vflt         runqsz   freemem  \
0  ...     0.0   0.0   1.6    2.6   16.00   26.40      CPU_Bound  4659.125   
1  ...     0.0   0.0   0.0    0.0   15.63   16.83  Not_CPU_Bound  4659.125   
2  ...     0.0   1.2   6.0    9.4  150.20  220.20  Not_CPU_Bound   702.000   
3  ...     0.0   0.0   0.2    0.2   15.60   16.80  Not_CPU_Bound  4659.125   
4  ...     0.0   0.0   1.0    1.2   37.80   47.60  Not_CPU_Bound   633.000   

    freeswap   usr  
0  1730946.0  95.0  
1  1869002.0  97.0  
2  1021237.0  87.0  
3  1863704.0  98.0  
4  1760253.0  90.0  

[5 rows x 22 columns]
In [28]:
# Set up subplot for boxplot
fig, axes = plt.subplots(figsize=(20,16))

# Draw boxplot for all numerical columns
sns.boxplot(data=df[num_columns], ax=axes)
axes.set_title('Boxplot for Numerical Columns')

# Show the boxplot
plt.show()
No description has been provided for this image
In [29]:
#encoding data
df = pd.get_dummies(df, columns=['runqsz'], drop_first=True)
df
Out[29]:
lread lwrite scall sread swrite fork exec rchar wchar pgout ... pgscan atch pgin ppgin pflt vflt freemem freeswap usr runqsz_Not_CPU_Bound
0 1.0 0.0 2147.0 79.0 68.0 0.2 0.20 40671.0 53995.0 0.0 ... 0.0 0.0 1.6000 2.60 16.00 26.40 4659.125 1730946.0 95.0 False
1 0.0 0.0 170.0 18.0 21.0 0.2 0.20 448.0 8385.0 0.0 ... 0.0 0.0 0.0000 0.00 15.63 16.83 4659.125 1869002.0 97.0 True
2 15.0 3.0 2162.0 159.0 119.0 2.0 2.40 125473.5 31950.0 0.0 ... 0.0 1.2 6.0000 9.40 150.20 220.20 702.000 1021237.0 87.0 True
3 0.0 0.0 160.0 12.0 16.0 0.2 0.20 125473.5 8670.0 0.0 ... 0.0 0.0 0.2000 0.20 15.60 16.80 4659.125 1863704.0 98.0 True
4 5.0 1.0 330.0 39.0 38.0 0.4 0.40 125473.5 12185.0 0.0 ... 0.0 0.0 1.0000 1.20 37.80 47.60 633.000 1760253.0 90.0 True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8187 16.0 12.0 3009.0 360.0 244.0 1.6 5.81 405250.0 85282.0 6.0 ... 0.0 0.6 23.5125 33.60 139.28 270.74 387.000 986647.0 80.0 False
8188 4.0 0.0 1596.0 170.0 146.0 2.4 1.80 89489.0 41764.0 3.8 ... 0.0 0.8 3.8000 4.40 122.40 212.60 263.000 1055742.0 90.0 True
8189 16.0 5.0 3116.0 289.0 190.0 0.6 0.60 325948.0 52640.0 0.4 ... 0.0 0.4 23.5125 33.60 60.20 219.80 400.000 969106.0 87.0 True
8190 32.0 25.0 5180.0 254.0 179.0 1.2 1.20 62571.0 29505.0 1.4 ... 0.0 0.4 23.0500 24.25 93.19 202.81 141.000 1022458.0 83.0 False
8191 2.0 0.0 985.0 55.0 46.0 1.6 4.80 111111.0 22256.0 0.0 ... 0.0 0.2 3.4000 6.20 91.80 110.00 659.000 1756514.0 94.0 False

8192 rows × 22 columns

In [30]:
df['runqsz_Not_CPU_Bound'] = df['runqsz_Not_CPU_Bound'].astype('uint8')
In [31]:
#Getting train and test variables
# independent variables
X = df.drop(["usr"], axis=1)
# dependent variable
y = df[["usr"]]
In [32]:
# let's add the intercept to data
import statsmodels.api as sm
X = sm.add_constant(X)
In [33]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8192 entries, 0 to 8191
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   lread                 8192 non-null   float64
 1   lwrite                8192 non-null   float64
 2   scall                 8192 non-null   float64
 3   sread                 8192 non-null   float64
 4   swrite                8192 non-null   float64
 5   fork                  8192 non-null   float64
 6   exec                  8192 non-null   float64
 7   rchar                 8192 non-null   float64
 8   wchar                 8192 non-null   float64
 9   pgout                 8192 non-null   float64
 10  ppgout                8192 non-null   float64
 11  pgfree                8192 non-null   float64
 12  pgscan                8192 non-null   float64
 13  atch                  8192 non-null   float64
 14  pgin                  8192 non-null   float64
 15  ppgin                 8192 non-null   float64
 16  pflt                  8192 non-null   float64
 17  vflt                  8192 non-null   float64
 18  freemem               8192 non-null   float64
 19  freeswap              8192 non-null   float64
 20  usr                   8192 non-null   float64
 21  runqsz_Not_CPU_Bound  8192 non-null   uint8  
dtypes: float64(21), uint8(1)
memory usage: 1.3 MB
In [34]:
#We will now split X and y into train and test sets in a 70:30 ratio.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1
)
In [35]:
#Fit Linear Model
olsmod = sm.OLS(y_train, X_train)
olsres = olsmod.fit()
In [36]:
# let's print the regression summary
print(olsres.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    usr   R-squared:                       0.796
Model:                            OLS   Adj. R-squared:                  0.795
Method:                 Least Squares   F-statistic:                     1115.
Date:                Thu, 11 Jan 2024   Prob (F-statistic):               0.00
Time:                        07:37:00   Log-Likelihood:                -16657.
No. Observations:                5734   AIC:                         3.336e+04
Df Residuals:                    5713   BIC:                         3.350e+04
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   84.1217      0.316    266.106      0.000      83.502      84.741
lread                   -0.0635      0.009     -7.071      0.000      -0.081      -0.046
lwrite                   0.0482      0.013      3.671      0.000       0.022       0.074
scall                   -0.0007   6.28e-05    -10.566      0.000      -0.001      -0.001
sread                    0.0003      0.001      0.305      0.760      -0.002       0.002
swrite                  -0.0054      0.001     -3.777      0.000      -0.008      -0.003
fork                     0.0293      0.132      0.222      0.824      -0.229       0.288
exec                    -0.3212      0.052     -6.220      0.000      -0.422      -0.220
rchar                -5.167e-06   4.88e-07    -10.598      0.000   -6.12e-06   -4.21e-06
wchar                -5.403e-06   1.03e-06     -5.232      0.000   -7.43e-06   -3.38e-06
pgout                   -0.3688      0.090     -4.098      0.000      -0.545      -0.192
ppgout                  -0.0766      0.079     -0.973      0.330      -0.231       0.078
pgfree                   0.0845      0.048      1.769      0.077      -0.009       0.178
pgscan                5.192e-14   2.39e-16    216.826      0.000    5.15e-14    5.24e-14
atch                     0.6276      0.143      4.394      0.000       0.348       0.908
pgin                     0.0200      0.028      0.703      0.482      -0.036       0.076
ppgin                   -0.0673      0.020     -3.415      0.001      -0.106      -0.029
pflt                    -0.0336      0.002    -16.957      0.000      -0.037      -0.030
vflt                    -0.0055      0.001     -3.830      0.000      -0.008      -0.003
freemem                 -0.0005   5.07e-05     -9.038      0.000      -0.001      -0.000
freeswap              8.832e-06    1.9e-07     46.472      0.000    8.46e-06     9.2e-06
runqsz_Not_CPU_Bound     1.6153      0.126     12.819      0.000       1.368       1.862
==============================================================================
Omnibus:                     1103.645   Durbin-Watson:                   2.016
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2372.553
Skew:                          -1.119   Prob(JB):                         0.00
Kurtosis:                       5.219   Cond. No.                     2.92e+22
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.34e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [37]:
# let's check the VIF of the predictors
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_series1 = pd.Series(
    [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])],
    index=X_train.columns,
)
print("VIF values: \n\n{}\n".format(vif_series1))
VIF values: 

const                   29.229332
lread                    5.350560
lwrite                   4.328397
scall                    2.960609
sread                    6.420172
swrite                   5.597135
fork                    13.035359
exec                     3.241417
rchar                    2.133616
wchar                    1.584381
pgout                   11.360363
ppgout                  29.404223
pgfree                  16.496748
pgscan                        NaN
atch                     1.875901
pgin                    13.809339
ppgin                   13.951855
pflt                    12.001460
vflt                    15.971049
freemem                  1.961304
freeswap                 1.841239
runqsz_Not_CPU_Bound     1.156815
dtype: float64

In [38]:
variables_to_drop = ["atch", "pgin", "ppgin", "pflt", "vflt", "pgfree", "ppgout", "pgout", "fork", "exec", "swrite", "sread", "lwrite", "lread", "pgscan", "runqsz_Not_CPU_Bound"]

for variable in variables_to_drop:
    X_train_temp = X_train.drop([variable], axis=1)
    ols_model_temp = sm.OLS(y_train, X_train_temp)
    ols_res_temp = ols_model_temp.fit()
    
    r_squared_diff = 0.795 - ols_res_temp.rsquared_adj
    
    
    print(f"On dropping '{variable}', Adjusted R-squared minus Adjusted R-squared is {np.round(r_squared_diff, 3)}")
    print(f"R-squared: {np.round(ols_res_temp.rsquared, 3)}\nAdjusted R-squared: {np.round(ols_res_temp.rsquared_adj, 3)}\n")
    
On dropping 'atch', Adjusted R-squared minus Adjusted R-squared is 0.0
R-squared: 0.795
Adjusted R-squared: 0.795

On dropping 'pgin', Adjusted R-squared minus Adjusted R-squared is -0.0
R-squared: 0.796
Adjusted R-squared: 0.795

On dropping 'ppgin', Adjusted R-squared minus Adjusted R-squared is -0.0
R-squared: 0.796
Adjusted R-squared: 0.795

On dropping 'pflt', Adjusted R-squared minus Adjusted R-squared is 0.01
R-squared: 0.786
Adjusted R-squared: 0.785

On dropping 'vflt', Adjusted R-squared minus Adjusted R-squared is 0.0
R-squared: 0.796
Adjusted R-squared: 0.795

On dropping 'pgfree', Adjusted R-squared minus Adjusted R-squared is -0.0
R-squared: 0.796
Adjusted R-squared: 0.795

On dropping 'ppgout', Adjusted R-squared minus Adjusted R-squared is -0.0
R-squared: 0.796
Adjusted R-squared: 0.795

On dropping 'pgout', Adjusted R-squared minus Adjusted R-squared is 0.0
R-squared: 0.796
Adjusted R-squared: 0.795

On dropping 'fork', Adjusted R-squared minus Adjusted R-squared is -0.0
R-squared: 0.796
Adjusted R-squared: 0.795

On dropping 'exec', Adjusted R-squared minus Adjusted R-squared is 0.001
R-squared: 0.795
Adjusted R-squared: 0.794

On dropping 'swrite', Adjusted R-squared minus Adjusted R-squared is 0.0
R-squared: 0.796
Adjusted R-squared: 0.795

On dropping 'sread', Adjusted R-squared minus Adjusted R-squared is -0.0
R-squared: 0.796
Adjusted R-squared: 0.795

On dropping 'lwrite', Adjusted R-squared minus Adjusted R-squared is 0.0
R-squared: 0.796
Adjusted R-squared: 0.795

On dropping 'lread', Adjusted R-squared minus Adjusted R-squared is 0.001
R-squared: 0.794
Adjusted R-squared: 0.794

On dropping 'pgscan', Adjusted R-squared minus Adjusted R-squared is -0.0
R-squared: 0.796
Adjusted R-squared: 0.795

On dropping 'runqsz_Not_CPU_Bound', Adjusted R-squared minus Adjusted R-squared is 0.005
R-squared: 0.79
Adjusted R-squared: 0.79

Since there is a very small effect (0.001) on adj. R-squared after dropping the list in variable "variables_to_drop", we can remove it from the training set.

In [39]:
X_train = X_train.drop(["pgout", "ppgin", "ppgout", "pgscan", "fork"], axis=1)
In [40]:
olsmod_2 = sm.OLS(y_train, X_train)
olsres_2 = olsmod_2.fit()
print(olsres_2.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    usr   R-squared:                       0.794
Model:                            OLS   Adj. R-squared:                  0.794
Method:                 Least Squares   F-statistic:                     1380.
Date:                Thu, 11 Jan 2024   Prob (F-statistic):               0.00
Time:                        07:37:00   Log-Likelihood:                -16683.
No. Observations:                5734   AIC:                         3.340e+04
Df Residuals:                    5717   BIC:                         3.351e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   84.1524      0.314    268.183      0.000      83.537      84.768
lread                   -0.0647      0.009     -7.201      0.000      -0.082      -0.047
lwrite                   0.0489      0.013      3.718      0.000       0.023       0.075
scall                   -0.0007   6.26e-05    -10.742      0.000      -0.001      -0.001
sread                    0.0002      0.001      0.225      0.822      -0.002       0.002
swrite                  -0.0052      0.001     -3.668      0.000      -0.008      -0.002
exec                    -0.3086      0.050     -6.213      0.000      -0.406      -0.211
rchar                -5.297e-06   4.85e-07    -10.911      0.000   -6.25e-06   -4.34e-06
wchar                -5.799e-06   1.03e-06     -5.607      0.000   -7.83e-06   -3.77e-06
pgfree                  -0.1109      0.016     -6.761      0.000      -0.143      -0.079
atch                     0.4034      0.138      2.921      0.003       0.133       0.674
pgin                    -0.0738      0.010     -7.483      0.000      -0.093      -0.054
pflt                    -0.0337      0.002    -18.277      0.000      -0.037      -0.030
vflt                    -0.0051      0.001     -4.023      0.000      -0.008      -0.003
freemem                 -0.0004   5.08e-05     -8.749      0.000      -0.001      -0.000
freeswap               8.82e-06   1.89e-07     46.581      0.000    8.45e-06    9.19e-06
runqsz_Not_CPU_Bound     1.5596      0.126     12.354      0.000       1.312       1.807
==============================================================================
Omnibus:                     1086.134   Durbin-Watson:                   2.015
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2304.413
Skew:                          -1.108   Prob(JB):                         0.00
Kurtosis:                       5.176   Cond. No.                     7.64e+06
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.64e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [41]:
vif_series2 = pd.Series(
    [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])],
    index=X_train.columns,
)
print("VIF values: \n\n{}\n".format(vif_series2))
VIF values: 

const                   28.561995
lread                    5.312644
lwrite                   4.305704
scall                    2.913380
sread                    6.402072
swrite                   5.422510
exec                     2.975076
rchar                    2.098192
wchar                    1.576027
pgfree                   1.930990
atch                     1.738475
pgin                     1.648652
pflt                    10.324080
vflt                    12.356350
freemem                  1.953053
freeswap                 1.812799
runqsz_Not_CPU_Bound     1.151692
dtype: float64

In [42]:
variables_to_drop2 = ["swrite", "sread", "lread", "lwrite", "pflt", "vflt", "atch", "exec"]

for variable in variables_to_drop2:
    X_train_temp = X_train.drop([variable], axis=1)
    ols_model_temp = sm.OLS(y_train, X_train_temp)
    ols_res_temp = ols_model_temp.fit()
    
    r_squared_diff = 0.795 - ols_res_temp.rsquared_adj
    
    print(f"On dropping '{variable}', Adjusted R-squared minus Adjusted R-squared is {np.round(r_squared_diff, 3)}")
    print(f"R-squared: {np.round(ols_res_temp.rsquared, 3)}\nAdjusted R-squared: {np.round(ols_res_temp.rsquared_adj, 3)}\n")
On dropping 'swrite', Adjusted R-squared minus Adjusted R-squared is 0.002
R-squared: 0.794
Adjusted R-squared: 0.793

On dropping 'sread', Adjusted R-squared minus Adjusted R-squared is 0.001
R-squared: 0.794
Adjusted R-squared: 0.794

On dropping 'lread', Adjusted R-squared minus Adjusted R-squared is 0.003
R-squared: 0.792
Adjusted R-squared: 0.792

On dropping 'lwrite', Adjusted R-squared minus Adjusted R-squared is 0.002
R-squared: 0.794
Adjusted R-squared: 0.793

On dropping 'pflt', Adjusted R-squared minus Adjusted R-squared is 0.013
R-squared: 0.782
Adjusted R-squared: 0.782

On dropping 'vflt', Adjusted R-squared minus Adjusted R-squared is 0.002
R-squared: 0.794
Adjusted R-squared: 0.793

On dropping 'atch', Adjusted R-squared minus Adjusted R-squared is 0.002
R-squared: 0.794
Adjusted R-squared: 0.793

On dropping 'exec', Adjusted R-squared minus Adjusted R-squared is 0.003
R-squared: 0.793
Adjusted R-squared: 0.792

In [43]:
X_train = X_train.drop(["vflt", "swrite", "sread"], axis=1)
In [44]:
olsmod_3 = sm.OLS(y_train, X_train)
olsres_3 = olsmod_3.fit()
print(olsres_3.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    usr   R-squared:                       0.793
Model:                            OLS   Adj. R-squared:                  0.792
Method:                 Least Squares   F-statistic:                     1682.
Date:                Thu, 11 Jan 2024   Prob (F-statistic):               0.00
Time:                        07:37:01   Log-Likelihood:                -16705.
No. Observations:                5734   AIC:                         3.344e+04
Df Residuals:                    5720   BIC:                         3.353e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   83.8075      0.310    270.281      0.000      83.200      84.415
lread                   -0.0681      0.009     -7.581      0.000      -0.086      -0.050
lwrite                   0.0519      0.013      3.943      0.000       0.026       0.078
scall                   -0.0009   4.86e-05    -17.694      0.000      -0.001      -0.001
exec                    -0.2682      0.046     -5.779      0.000      -0.359      -0.177
rchar                -5.509e-06   4.35e-07    -12.670      0.000   -6.36e-06   -4.66e-06
wchar                 -6.75e-06   9.83e-07     -6.868      0.000   -8.68e-06   -4.82e-06
pgfree                  -0.1170      0.016     -7.134      0.000      -0.149      -0.085
atch                     0.3907      0.138      2.825      0.005       0.120       0.662
pgin                    -0.0853      0.010     -8.961      0.000      -0.104      -0.067
pflt                    -0.0420      0.001    -42.696      0.000      -0.044      -0.040
freemem                 -0.0004   5.07e-05     -8.351      0.000      -0.001      -0.000
freeswap              8.984e-06   1.87e-07     47.951      0.000    8.62e-06    9.35e-06
runqsz_Not_CPU_Bound     1.5512      0.127     12.243      0.000       1.303       1.800
==============================================================================
Omnibus:                      991.432   Durbin-Watson:                   2.012
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2024.692
Skew:                          -1.034   Prob(JB):                         0.00
Kurtosis:                       5.048   Cond. No.                     7.53e+06
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.53e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [45]:
vif_series3 = pd.Series(
    [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])],
    index=X_train.columns,
)
print("VIF values: \n\n{}\n".format(vif_series3))
VIF values: 

const                   27.685755
lread                    5.273659
lwrite                   4.286869
scall                    1.742585
exec                     2.577562
rchar                    1.670597
wchar                    1.412736
pgfree                   1.917857
atch                     1.731517
pgin                     1.524691
pflt                     2.914826
freemem                  1.930797
freeswap                 1.761721
runqsz_Not_CPU_Bound     1.151418
dtype: float64

In [ ]:
 
In [46]:
variables_to_drop3 = ["lwrite", "exec", "pflt", "lread", "atch"]

for variable in variables_to_drop3:
    X_train_temp = X_train.drop([variable], axis=1)
    ols_model_temp = sm.OLS(y_train, X_train_temp)
    ols_res_temp = ols_model_temp.fit()
    
    r_squared_diff = 0.795 - ols_res_temp.rsquared_adj
    
    print(f"On dropping '{variable}', Adjusted R-squared minus Adjusted R-squared is {np.round(r_squared_diff, 3)}")
    print(f"R-squared: {np.round(ols_res_temp.rsquared, 3)}\nAdjusted R-squared: {np.round(ols_res_temp.rsquared_adj, 3)}\n")
On dropping 'lwrite', Adjusted R-squared minus Adjusted R-squared is 0.003
R-squared: 0.792
Adjusted R-squared: 0.792

On dropping 'exec', Adjusted R-squared minus Adjusted R-squared is 0.004
R-squared: 0.791
Adjusted R-squared: 0.791

On dropping 'pflt', Adjusted R-squared minus Adjusted R-squared is 0.069
R-squared: 0.727
Adjusted R-squared: 0.726

On dropping 'lread', Adjusted R-squared minus Adjusted R-squared is 0.005
R-squared: 0.791
Adjusted R-squared: 0.79

On dropping 'atch', Adjusted R-squared minus Adjusted R-squared is 0.003
R-squared: 0.792
Adjusted R-squared: 0.792

In [47]:
X_train = X_train.drop(["exec", "lwrite"], axis=1)
In [ ]:
 
In [48]:
olsmod_4 = sm.OLS(y_train, X_train)
olsres_4 = olsmod_4.fit()
print(olsres_4.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    usr   R-squared:                       0.791
Model:                            OLS   Adj. R-squared:                  0.790
Method:                 Least Squares   F-statistic:                     1966.
Date:                Thu, 11 Jan 2024   Prob (F-statistic):               0.00
Time:                        07:37:01   Log-Likelihood:                -16731.
No. Observations:                5734   AIC:                         3.349e+04
Df Residuals:                    5722   BIC:                         3.357e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   83.8133      0.311    269.545      0.000      83.204      84.423
lread                   -0.0396      0.004     -8.900      0.000      -0.048      -0.031
scall                   -0.0009   4.87e-05    -18.200      0.000      -0.001      -0.001
rchar                -5.528e-06   4.37e-07    -12.660      0.000   -6.38e-06   -4.67e-06
wchar                -6.343e-06   9.84e-07     -6.445      0.000   -8.27e-06   -4.41e-06
pgfree                  -0.1202      0.016     -7.298      0.000      -0.152      -0.088
atch                     0.3536      0.139      2.550      0.011       0.082       0.625
pgin                    -0.0957      0.009    -10.144      0.000      -0.114      -0.077
pflt                    -0.0467      0.001    -64.433      0.000      -0.048      -0.045
freemem                 -0.0004   5.09e-05     -8.116      0.000      -0.001      -0.000
freeswap              8.997e-06   1.88e-07     47.815      0.000    8.63e-06    9.37e-06
runqsz_Not_CPU_Bound     1.6020      0.127     12.627      0.000       1.353       1.851
==============================================================================
Omnibus:                      943.471   Durbin-Watson:                   2.014
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1880.411
Skew:                          -0.999   Prob(JB):                         0.00
Kurtosis:                       4.969   Cond. No.                     7.52e+06
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.52e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [49]:
vif_series4 = pd.Series(
    [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])],
    index=X_train.columns,
)
print("VIF values: \n\n{}\n".format(vif_series4))
VIF values: 

const                   27.599007
lread                    1.284916
scall                    1.732868
rchar                    1.670331
wchar                    1.404285
pgfree                   1.915050
atch                     1.725226
pgin                     1.484113
pflt                     1.567345
freemem                  1.929209
freeswap                 1.761537
runqsz_Not_CPU_Bound     1.144543
dtype: float64

In [ ]:
 

Assumptions of Linear Regression¶

In [50]:
df_pred = pd.DataFrame()

df_pred["Actual Values"] = y_train.values.flatten()  # actual values
df_pred["Fitted Values"] = olsres_4.fittedvalues.values  # predicted values
df_pred["Residuals"] = olsres_4.resid.values  # residuals

df_pred.head()
Out[50]:
Actual Values Fitted Values Residuals
0 91.0 89.693813 1.306187
1 94.0 91.604172 2.395828
2 61.5 74.803965 -13.303965
3 83.0 80.799348 2.200652
4 94.0 98.105864 -4.105864
In [51]:
# let us plot the fitted values vs residuals
sns.set_style("whitegrid")
sns.residplot(
    data=df_pred, x="Fitted Values", y="Residuals", color="purple", lowess=True
)
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.title("Fitted vs Residual plot")
plt.show()
No description has been provided for this image
No pattern in the data thus the assumption of linearity and independence of predictors satisfied¶
In [52]:
# columns in training set
X_train.columns
Out[52]:
Index(['const', 'lread', 'scall', 'rchar', 'wchar', 'pgfree', 'atch', 'pgin',
       'pflt', 'freemem', 'freeswap', 'runqsz_Not_CPU_Bound'],
      dtype='object')
In [53]:
# checking the distribution of variables in training set with dependent variable
sns_plot = sns.pairplot(df[['usr', 'lread', 'scall', 'rchar', 'wchar', 'pgfree', 'atch', 'pgin',
       'pflt', 'freemem', 'freeswap', 'runqsz_Not_CPU_Bound']])
sns_plot.figure.savefig("pairplot.png")
plt.show()
No description has been provided for this image
In [54]:
# using square transformation
X_train["scall_sq"] = np.square(X_train["scall"])

# let's create a model with the transformed data
olsmod_5 = sm.OLS(y_train, X_train)
olsres_5 = olsmod_5.fit()
print(olsres_5.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    usr   R-squared:                       0.799
Model:                            OLS   Adj. R-squared:                  0.799
Method:                 Least Squares   F-statistic:                     1895.
Date:                Thu, 11 Jan 2024   Prob (F-statistic):               0.00
Time:                        07:38:00   Log-Likelihood:                -16616.
No. Observations:                5734   AIC:                         3.326e+04
Df Residuals:                    5721   BIC:                         3.335e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   80.9943      0.356    227.462      0.000      80.296      81.692
lread                   -0.0404      0.004     -9.253      0.000      -0.049      -0.032
scall                    0.0011      0.000      7.979      0.000       0.001       0.001
rchar                -5.472e-06   4.28e-07    -12.784      0.000   -6.31e-06   -4.63e-06
wchar                -6.973e-06   9.66e-07     -7.222      0.000   -8.87e-06   -5.08e-06
pgfree                  -0.1095      0.016     -6.775      0.000      -0.141      -0.078
atch                     0.3592      0.136      2.643      0.008       0.093       0.626
pgin                    -0.1083      0.009    -11.664      0.000      -0.127      -0.090
pflt                    -0.0476      0.001    -66.780      0.000      -0.049      -0.046
freemem                 -0.0003   5.08e-05     -5.359      0.000      -0.000      -0.000
freeswap              9.468e-06   1.87e-07     50.635      0.000     9.1e-06    9.83e-06
runqsz_Not_CPU_Bound     1.8196      0.125     14.537      0.000       1.574       2.065
scall_sq             -3.192e-07   2.08e-08    -15.312      0.000    -3.6e-07   -2.78e-07
==============================================================================
Omnibus:                      867.663   Durbin-Watson:                   2.012
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1685.469
Skew:                          -0.938   Prob(JB):                         0.00
Kurtosis:                       4.881   Cond. No.                     7.71e+07
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.71e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
In [55]:
# using square transformation
X_train["pflt_sq"] = np.square(X_train["pflt"])

# let's create a model with the transformed data
olsmod_6 = sm.OLS(y_train, X_train)
olsres_6 = olsmod_6.fit()
print(olsres_6.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    usr   R-squared:                       0.803
Model:                            OLS   Adj. R-squared:                  0.802
Method:                 Least Squares   F-statistic:                     1788.
Date:                Thu, 11 Jan 2024   Prob (F-statistic):               0.00
Time:                        07:38:00   Log-Likelihood:                -16565.
No. Observations:                5734   AIC:                         3.316e+04
Df Residuals:                    5720   BIC:                         3.325e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   80.3258      0.359    223.722      0.000      79.622      81.030
lread                   -0.0406      0.004     -9.385      0.000      -0.049      -0.032
scall                    0.0009      0.000      6.575      0.000       0.001       0.001
rchar                -5.569e-06   4.24e-07    -13.124      0.000    -6.4e-06   -4.74e-06
wchar                -7.545e-06   9.59e-07     -7.870      0.000   -9.42e-06   -5.67e-06
pgfree                  -0.1040      0.016     -6.489      0.000      -0.135      -0.073
atch                     0.3076      0.135      2.282      0.023       0.043       0.572
pgin                    -0.1148      0.009    -12.443      0.000      -0.133      -0.097
pflt                    -0.0271      0.002    -12.676      0.000      -0.031      -0.023
freemem                 -0.0003   5.03e-05     -5.551      0.000      -0.000      -0.000
freeswap              9.528e-06   1.85e-07     51.383      0.000    9.16e-06    9.89e-06
runqsz_Not_CPU_Bound     1.8713      0.124     15.070      0.000       1.628       2.115
scall_sq             -2.861e-07   2.09e-08    -13.678      0.000   -3.27e-07   -2.45e-07
pflt_sq              -6.019e-05   5.93e-06    -10.147      0.000   -7.18e-05   -4.86e-05
==============================================================================
Omnibus:                      839.958   Durbin-Watson:                   2.002
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1613.180
Skew:                          -0.916   Prob(JB):                         0.00
Kurtosis:                       4.843   Cond. No.                     7.84e+07
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.84e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
In [57]:
# let us recreate the dataframe with actual, fitted and residual values
df_pred = pd.DataFrame()

df_pred["Actual Values"] = y_train.values.flatten()  # actual values
df_pred["Fitted Values"] = olsres_6.fittedvalues.values  # predicted values
df_pred["Residuals"] = olsres_6.resid.values  # residuals

df_pred.head()
Out[57]:
Actual Values Fitted Values Residuals
0 91.0 89.130465 1.869535
1 94.0 91.341269 2.658731
2 61.5 74.962262 -13.462262
3 83.0 81.713945 1.286055
4 94.0 97.136625 -3.136625
In [58]:
# let us plot the fitted values vs residuals
sns.set_style("whitegrid")
sns.residplot(
    data=df_pred, x="Fitted Values", y="Residuals", color="purple", lowess=True
)
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.title("Fitted vs Residual plot")
plt.show()
No description has been provided for this image

Test for Normality¶

In [59]:
sns.histplot(df_pred["Residuals"], kde=True)
plt.title("Normality of residuals")
plt.show()
No description has been provided for this image
In [60]:
w, p1_value = stats.shapiro(df_pred["Residuals"])

print('The p-value for Residuals is', p1_value)

alpha = 0.05  # Set your significance level
if p1_value > alpha:
    print("The data appears to be normally distributed (fail to reject H0)")
else:
    print("The data does not appear to be normally distributed (reject H0)")
The p-value for Residuals is 3.891742147061455e-39
The data does not appear to be normally distributed (reject H0)
In [61]:
import pylab
import scipy.stats as stats

stats.probplot(df_pred["Residuals"], dist="norm", plot=pylab)
plt.show()
No description has been provided for this image
In [62]:
X_train.skew()
Out[62]:
const                   0.000000
lread                   1.208855
scall                   0.715733
rchar                   1.119972
wchar                   1.154196
pgfree                  1.195839
atch                    1.160107
pgin                    1.239660
pflt                    1.200329
freemem                 1.196730
freeswap               -0.779456
runqsz_Not_CPU_Bound   -0.120234
scall_sq                1.922564
pflt_sq                 2.019616
dtype: float64

Test for Homoscedasticity¶

The null and alternate hypotheses of the goldfeldquandt test are as follows:

  • Null hypothesis : Residuals are homoscedastic
  • Alternate hypothesis : Residuals have hetroscedasticity
In [63]:
import statsmodels.stats.api as sms
sms.het_goldfeldquandt(df_pred["Residuals"], X_train)[1]
Out[63]:
0.011332917693381926
Since p-value < 0.05 we can say that the residuals are hetroscedasticity.¶
In [ ]:
 

The model built olsmod_6 satisfies some assumptions of Linear Regression¶

In [64]:
olsres_6.summary()
Out[64]:
OLS Regression Results
Dep. Variable: usr R-squared: 0.803
Model: OLS Adj. R-squared: 0.802
Method: Least Squares F-statistic: 1788.
Date: Thu, 11 Jan 2024 Prob (F-statistic): 0.00
Time: 07:39:46 Log-Likelihood: -16565.
No. Observations: 5734 AIC: 3.316e+04
Df Residuals: 5720 BIC: 3.325e+04
Df Model: 13
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 80.3258 0.359 223.722 0.000 79.622 81.030
lread -0.0406 0.004 -9.385 0.000 -0.049 -0.032
scall 0.0009 0.000 6.575 0.000 0.001 0.001
rchar -5.569e-06 4.24e-07 -13.124 0.000 -6.4e-06 -4.74e-06
wchar -7.545e-06 9.59e-07 -7.870 0.000 -9.42e-06 -5.67e-06
pgfree -0.1040 0.016 -6.489 0.000 -0.135 -0.073
atch 0.3076 0.135 2.282 0.023 0.043 0.572
pgin -0.1148 0.009 -12.443 0.000 -0.133 -0.097
pflt -0.0271 0.002 -12.676 0.000 -0.031 -0.023
freemem -0.0003 5.03e-05 -5.551 0.000 -0.000 -0.000
freeswap 9.528e-06 1.85e-07 51.383 0.000 9.16e-06 9.89e-06
runqsz_Not_CPU_Bound 1.8713 0.124 15.070 0.000 1.628 2.115
scall_sq -2.861e-07 2.09e-08 -13.678 0.000 -3.27e-07 -2.45e-07
pflt_sq -6.019e-05 5.93e-06 -10.147 0.000 -7.18e-05 -4.86e-05
Omnibus: 839.958 Durbin-Watson: 2.002
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1613.180
Skew: -0.916 Prob(JB): 0.00
Kurtosis: 4.843 Cond. No. 7.84e+07


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.84e+07. This might indicate that there are
strong multicollinearity or other numerical problems.

The model equation will be as follows:¶

In [65]:
# Let us write the equation of linear regression
Equation = "usr ="
print(Equation, end=" ")
for i in range(len(X_train.columns)):
    if i == 0:
        print(olsres_6.params[i], "+", end=" ")
    elif i != len(X_train.columns) - 1:
        print(
            olsres_6.params[i],
            "* (",
            X_train.columns[i],
            ")",
            "+",
            end="  ",
        )
    else:
        print(olsres_6.params[i], "* (", X_train.columns[i], ")")
usr = 80.32578269641903 + -0.04060862245726409 * ( lread ) +  0.0009109432268011643 * ( scall ) +  -5.5691044371107365e-06 * ( rchar ) +  -7.5446719419862535e-06 * ( wchar ) +  -0.10398102553887754 * ( pgfree ) +  0.30760866082751137 * ( atch ) +  -0.1148167884326387 * ( pgin ) +  -0.0271225198213273 * ( pflt ) +  -0.0002793470072376525 * ( freemem ) +  9.5281546836217e-06 * ( freeswap ) +  1.871281340030479 * ( runqsz_Not_CPU_Bound ) +  -2.8613266551192783e-07 * ( scall_sq ) +  -6.018855684078149e-05 * ( pflt_sq )

Predictions¶

In [66]:
X_train.columns
Out[66]:
Index(['const', 'lread', 'scall', 'rchar', 'wchar', 'pgfree', 'atch', 'pgin',
       'pflt', 'freemem', 'freeswap', 'runqsz_Not_CPU_Bound', 'scall_sq',
       'pflt_sq'],
      dtype='object')
In [67]:
X_test.columns
Out[67]:
Index(['const', 'lread', 'lwrite', 'scall', 'sread', 'swrite', 'fork', 'exec',
       'rchar', 'wchar', 'pgout', 'ppgout', 'pgfree', 'pgscan', 'atch', 'pgin',
       'ppgin', 'pflt', 'vflt', 'freemem', 'freeswap', 'runqsz_Not_CPU_Bound'],
      dtype='object')
In [70]:
# dropping columns from the test data that are not there in the training data
X_test2 = X_test.drop(
    ["exec", "lwrite", "vflt", "swrite", "sread", "pgout", "ppgin", "ppgout", "pgscan", "fork"], axis=1
)
In [73]:
# transforming the weight column in the test data corresponding to the training set
X_test2["scall_sq"] = np.square(X_test2["scall"])
X_test2["pflt_sq"] = np.square(X_test2["pflt"])
In [74]:
# let's make predictions on the test set
y_pred_test = olsres_6.predict(X_test2)
y_pred_train = olsres_6.predict(X_train)
In [75]:
# To check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error
In [76]:
# let's check the RMSE on the train data
rmse1 = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse1
Out[76]:
4.349243018051866
In [77]:
# let's check the RMSE on the test data
rmse2 = np.sqrt(mean_squared_error(y_test, y_pred_test))
rmse2
Out[77]:
4.542132394160572
In [78]:
# let's check the MAE on the train data
mae1 = mean_absolute_error(y_train, df_pred["Fitted Values"])
mae1
Out[78]:
3.2083224632361858
In [79]:
# let's check the MAE on the test data
mae2 = mean_absolute_error(y_test, y_pred_test)
mae2
Out[79]:
3.260139643316443

Sci-Kit Learn Linear Regression¶

In [80]:
# invoke the LinearRegression function and find the bestfit model on training data
from sklearn.linear_model import LinearRegression
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
Out[80]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [81]:
# Let us explore the coefficients for each of the independent attributes

for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
The coefficient for const is 0.0
The coefficient for lread is -0.040608622458206595
The coefficient for scall is 0.0009109432268176952
The coefficient for rchar is -5.569104437094579e-06
The coefficient for wchar is -7.544671942038639e-06
The coefficient for pgfree is -0.10398102553937687
The coefficient for atch is 0.3076086608290831
The coefficient for pgin is -0.11481678843273727
The coefficient for pflt is -0.02712251982132653
The coefficient for freemem is -0.0002793470072352273
The coefficient for freeswap is 9.528154683609963e-06
The coefficient for runqsz_Not_CPU_Bound is 1.8712813400322006
The coefficient for scall_sq is -2.861326655126395e-07
The coefficient for pflt_sq is -6.018855684120802e-05
In [82]:
# Let us check the intercept for the model

intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))
The intercept for our model is 80.32578269653469
In [83]:
# R square on training data
regression_model.score(X_train, y_train)
Out[83]:
0.8025428491193711
In [84]:
# R square on testing data
regression_model.score(X_test2, y_test)
Out[84]:
0.7786015387667776
In [85]:
#RMSE on Training data
from sklearn import metrics

predicted_train=regression_model.fit(X_train, y_train).predict(X_train)
np.sqrt(metrics.mean_squared_error(y_train,predicted_train))
Out[85]:
4.349243018051866
In [86]:
#RMSE on Testing data
predicted_test=regression_model.fit(X_train, y_train).predict(X_test2)
np.sqrt(metrics.mean_squared_error(y_test,predicted_test))
Out[86]:
4.542132394160525
In [ ]: